1#!/usr/bin/env python 2 3''' 4The sample demonstrates how to train Random Trees classifier 5(or Boosting classifier, or MLP, or Knearest, or Support Vector Machines) using the provided dataset. 6 7We use the sample database letter-recognition.data 8from UCI Repository, here is the link: 9 10Newman, D.J. & Hettich, S. & Blake, C.L. & Merz, C.J. (1998). 11UCI Repository of machine learning databases 12[http://www.ics.uci.edu/~mlearn/MLRepository.html]. 13Irvine, CA: University of California, Department of Information and Computer Science. 14 15The dataset consists of 20000 feature vectors along with the 16responses - capital latin letters A..Z. 17The first 10000 samples are used for training 18and the remaining 10000 - to test the classifier. 19====================================================== 20USAGE: 21 letter_recog.py [--model <model>] 22 [--data <data fn>] 23 [--load <model fn>] [--save <model fn>] 24 25 Models: RTrees, KNearest, Boost, SVM, MLP 26''' 27 28import numpy as np 29import cv2 30 31def load_base(fn): 32 a = np.loadtxt(fn, np.float32, delimiter=',', converters={ 0 : lambda ch : ord(ch)-ord('A') }) 33 samples, responses = a[:,1:], a[:,0] 34 return samples, responses 35 36class LetterStatModel(object): 37 class_n = 26 38 train_ratio = 0.5 39 40 def load(self, fn): 41 self.model.load(fn) 42 def save(self, fn): 43 self.model.save(fn) 44 45 def unroll_samples(self, samples): 46 sample_n, var_n = samples.shape 47 new_samples = np.zeros((sample_n * self.class_n, var_n+1), np.float32) 48 new_samples[:,:-1] = np.repeat(samples, self.class_n, axis=0) 49 new_samples[:,-1] = np.tile(np.arange(self.class_n), sample_n) 50 return new_samples 51 52 def unroll_responses(self, responses): 53 sample_n = len(responses) 54 new_responses = np.zeros(sample_n*self.class_n, np.int32) 55 resp_idx = np.int32( responses + np.arange(sample_n)*self.class_n ) 56 new_responses[resp_idx] = 1 57 return new_responses 58 59class RTrees(LetterStatModel): 60 def __init__(self): 61 self.model = cv2.RTrees() 62 63 def train(self, samples, responses): 64 sample_n, var_n = samples.shape 65 var_types = np.array([cv2.CV_VAR_NUMERICAL] * var_n + [cv2.CV_VAR_CATEGORICAL], np.uint8) 66 #CvRTParams(10,10,0,false,15,0,true,4,100,0.01f,CV_TERMCRIT_ITER)); 67 params = dict(max_depth=10 ) 68 self.model.train(samples, cv2.CV_ROW_SAMPLE, responses, varType = var_types, params = params) 69 70 def predict(self, samples): 71 return np.float32( [self.model.predict(s) for s in samples] ) 72 73 74class KNearest(LetterStatModel): 75 def __init__(self): 76 self.model = cv2.KNearest() 77 78 def train(self, samples, responses): 79 self.model.train(samples, responses) 80 81 def predict(self, samples): 82 retval, results, neigh_resp, dists = self.model.find_nearest(samples, k = 10) 83 return results.ravel() 84 85 86class Boost(LetterStatModel): 87 def __init__(self): 88 self.model = cv2.Boost() 89 90 def train(self, samples, responses): 91 sample_n, var_n = samples.shape 92 new_samples = self.unroll_samples(samples) 93 new_responses = self.unroll_responses(responses) 94 var_types = np.array([cv2.CV_VAR_NUMERICAL] * var_n + [cv2.CV_VAR_CATEGORICAL, cv2.CV_VAR_CATEGORICAL], np.uint8) 95 #CvBoostParams(CvBoost::REAL, 100, 0.95, 5, false, 0 ) 96 params = dict(max_depth=5) #, use_surrogates=False) 97 self.model.train(new_samples, cv2.CV_ROW_SAMPLE, new_responses, varType = var_types, params=params) 98 99 def predict(self, samples): 100 new_samples = self.unroll_samples(samples) 101 pred = np.array( [self.model.predict(s, returnSum = True) for s in new_samples] ) 102 pred = pred.reshape(-1, self.class_n).argmax(1) 103 return pred 104 105 106class SVM(LetterStatModel): 107 def __init__(self): 108 self.model = cv2.SVM() 109 110 def train(self, samples, responses): 111 params = dict( kernel_type = cv2.SVM_LINEAR, 112 svm_type = cv2.SVM_C_SVC, 113 C = 1 ) 114 self.model.train(samples, responses, params = params) 115 116 def predict(self, samples): 117 return self.model.predict_all(samples).ravel() 118 119 120class MLP(LetterStatModel): 121 def __init__(self): 122 self.model = cv2.ANN_MLP() 123 124 def train(self, samples, responses): 125 sample_n, var_n = samples.shape 126 new_responses = self.unroll_responses(responses).reshape(-1, self.class_n) 127 128 layer_sizes = np.int32([var_n, 100, 100, self.class_n]) 129 self.model.create(layer_sizes) 130 131 # CvANN_MLP_TrainParams::BACKPROP,0.001 132 params = dict( term_crit = (cv2.TERM_CRITERIA_COUNT, 300, 0.01), 133 train_method = cv2.ANN_MLP_TRAIN_PARAMS_BACKPROP, 134 bp_dw_scale = 0.001, 135 bp_moment_scale = 0.0 ) 136 self.model.train(samples, np.float32(new_responses), None, params = params) 137 138 def predict(self, samples): 139 ret, resp = self.model.predict(samples) 140 return resp.argmax(-1) 141 142 143if __name__ == '__main__': 144 import getopt 145 import sys 146 147 print __doc__ 148 149 models = [RTrees, KNearest, Boost, SVM, MLP] # NBayes 150 models = dict( [(cls.__name__.lower(), cls) for cls in models] ) 151 152 153 args, dummy = getopt.getopt(sys.argv[1:], '', ['model=', 'data=', 'load=', 'save=']) 154 args = dict(args) 155 args.setdefault('--model', 'rtrees') 156 args.setdefault('--data', '../data/letter-recognition.data') 157 158 print 'loading data %s ...' % args['--data'] 159 samples, responses = load_base(args['--data']) 160 Model = models[args['--model']] 161 model = Model() 162 163 train_n = int(len(samples)*model.train_ratio) 164 if '--load' in args: 165 fn = args['--load'] 166 print 'loading model from %s ...' % fn 167 model.load(fn) 168 else: 169 print 'training %s ...' % Model.__name__ 170 model.train(samples[:train_n], responses[:train_n]) 171 172 print 'testing...' 173 train_rate = np.mean(model.predict(samples[:train_n]) == responses[:train_n]) 174 test_rate = np.mean(model.predict(samples[train_n:]) == responses[train_n:]) 175 176 print 'train rate: %f test rate: %f' % (train_rate*100, test_rate*100) 177 178 if '--save' in args: 179 fn = args['--save'] 180 print 'saving model to %s ...' % fn 181 model.save(fn) 182 cv2.destroyAllWindows() 183