1#!/usr/bin/env python
2
3'''
The sample demonstrates how to train a Random Trees classifier
(or a Boosting classifier, MLP, KNearest, or Support Vector Machine) using the provided dataset.
6
We use the sample database letter-recognition.data
from the UCI Repository; here is the citation:
9
10Newman, D.J. & Hettich, S. & Blake, C.L. & Merz, C.J. (1998).
11UCI Repository of machine learning databases
12[http://www.ics.uci.edu/~mlearn/MLRepository.html].
13Irvine, CA: University of California, Department of Information and Computer Science.
14
The dataset consists of 20000 feature vectors along with the
responses - capital Latin letters A..Z.
The first 10000 samples are used for training
and the remaining 10000 to test the classifier.
19======================================================
20USAGE:
21  letter_recog.py [--model <model>]
22                  [--data <data fn>]
23                  [--load <model fn>] [--save <model fn>]
24
25  Models: RTrees, KNearest, Boost, SVM, MLP
26'''
27
28import numpy as np
29import cv2
30
def load_base(fn):
    '''
    Load a comma-separated letter dataset.

    Column 0 of every row holds the class letter and the remaining columns
    hold the numeric features.

    fn -- anything np.loadtxt accepts as its input (e.g. a file name).

    Returns (samples, responses): samples is a 2-D float32 array with one row
    per record (all columns except the first), responses is a 1-D float32
    array holding the letter of each record as an offset from 'A'
    ('A' -> 0, 'B' -> 1, ...).
    '''
    def letter_to_index(ch):
        # Turn the class letter into its zero-based offset from 'A'.
        return ord(ch) - ord('A')

    # ndmin=2 keeps the result two-dimensional even when the input contains a
    # single record; without it loadtxt squeezes that record to 1-D and the
    # column slicing below fails.
    a = np.loadtxt(fn, np.float32, delimiter=',',
                   converters={0: letter_to_index}, ndmin=2)
    samples, responses = a[:, 1:], a[:, 0]
    return samples, responses
35
class LetterStatModel(object):
    '''
    Shared base for the letter classifiers of this sample.

    It relies on self.model being set by the subclass and offers helpers that
    expand a class_n-way problem into one row (or one flag) per
    (sample, class) pair.
    '''
    class_n = 26       # number of classes (letters A..Z)
    train_ratio = 0.5  # fraction of the dataset the script uses for training

    def load(self, fn):
        '''Forward to self.model.load(fn).'''
        self.model.load(fn)

    def save(self, fn):
        '''Forward to self.model.save(fn).'''
        self.model.save(fn)

    def unroll_samples(self, samples):
        '''
        Repeat every sample class_n times and append a class-index column.

        For input of shape (sample_n, var_n) the result is a float32 array of
        shape (sample_n * class_n, var_n + 1); within each group of class_n
        consecutive rows the last column counts 0 .. class_n-1.
        '''
        row_count, feature_count = samples.shape
        class_count = self.class_n
        repeated = np.repeat(samples, class_count, axis=0)
        class_column = np.tile(np.arange(class_count), row_count)

        unrolled = np.zeros((row_count * class_count, feature_count + 1), np.float32)
        unrolled[:, :-1] = repeated
        unrolled[:, -1] = class_column
        return unrolled

    def unroll_responses(self, responses):
        '''
        Expand class indices into a flat int32 vector of 0/1 flags.

        The result has len(responses) * class_n entries; for sample i with
        response r the entry at i * class_n + r is 1, all others are 0.
        '''
        count = len(responses)
        row_starts = np.arange(count) * self.class_n
        hot_positions = np.int32(responses + row_starts)

        flags = np.zeros(count * self.class_n, np.int32)
        flags[hot_positions] = 1
        return flags
58
class RTrees(LetterStatModel):
    '''Letter classifier backed by cv2.RTrees.'''

    def __init__(self):
        self.model = cv2.RTrees()

    def train(self, samples, responses):
        '''Train on row-ordered samples with one response per row.'''
        _, feature_count = samples.shape
        # One numerical entry per feature column plus a trailing categorical
        # entry (presumably describing the response -- confirm against the
        # OpenCV ML documentation).
        type_list = [cv2.CV_VAR_NUMERICAL] * feature_count + [cv2.CV_VAR_CATEGORICAL]
        var_types = np.array(type_list, np.uint8)
        # Parameter set noted alongside this model for reference:
        # CvRTParams(10,10,0,false,15,0,true,4,100,0.01f,CV_TERMCRIT_ITER)
        # Only max_depth is passed here.
        tree_params = {'max_depth': 10}
        self.model.train(samples, cv2.CV_ROW_SAMPLE, responses,
                         varType=var_types, params=tree_params)

    def predict(self, samples):
        '''Predict each sample individually; returns a float32 array.'''
        predictions = [self.model.predict(row) for row in samples]
        return np.float32(predictions)
72
73
class KNearest(LetterStatModel):
    '''Letter classifier backed by cv2.KNearest.'''

    def __init__(self):
        self.model = cv2.KNearest()

    def train(self, samples, responses):
        '''Hand the samples and their responses to the underlying model.'''
        self.model.train(samples, responses)

    def predict(self, samples):
        '''Query with k = 10 and return the results flattened to 1-D.'''
        # find_nearest yields four values; only the second (results) is used.
        _retval, results, _neigh_resp, _dists = self.model.find_nearest(samples, k=10)
        return results.ravel()
84
85
class Boost(LetterStatModel):
    '''
    Letter classifier backed by cv2.Boost.

    The samples and responses are unrolled so that each (sample, class) pair
    becomes one row with a 0/1 response.
    '''

    def __init__(self):
        self.model = cv2.Boost()

    def train(self, samples, responses):
        '''Train on the unrolled (sample, class) rows.'''
        _, feature_count = samples.shape
        unrolled_samples = self.unroll_samples(samples)
        unrolled_responses = self.unroll_responses(responses)
        # The unrolled rows carry an extra class-index column, hence two
        # categorical entries after the numerical feature entries (the last
        # one presumably describes the response -- confirm against the
        # OpenCV ML documentation).
        type_list = [cv2.CV_VAR_NUMERICAL] * feature_count
        type_list = type_list + [cv2.CV_VAR_CATEGORICAL, cv2.CV_VAR_CATEGORICAL]
        var_types = np.array(type_list, np.uint8)
        # Parameter set noted alongside this model for reference:
        # CvBoostParams(CvBoost::REAL, 100, 0.95, 5, false, 0)
        # Only max_depth is passed here (use_surrogates is left unset).
        boost_params = {'max_depth': 5}
        self.model.train(unrolled_samples, cv2.CV_ROW_SAMPLE, unrolled_responses,
                         varType=var_types, params=boost_params)

    def predict(self, samples):
        '''Return, per sample, the class whose unrolled row has the largest sum.'''
        unrolled_samples = self.unroll_samples(samples)
        sums = np.array([self.model.predict(row, returnSum=True)
                         for row in unrolled_samples])
        # One row of class_n values per original sample; take the best class.
        per_sample = sums.reshape(-1, self.class_n)
        return per_sample.argmax(1)
104
105
class SVM(LetterStatModel):
    '''Letter classifier backed by cv2.SVM (linear kernel, C-SVC, C = 1).'''

    def __init__(self):
        self.model = cv2.SVM()

    def train(self, samples, responses):
        '''Train the SVM with the fixed parameter set of this sample.'''
        svm_params = {
            'kernel_type': cv2.SVM_LINEAR,
            'svm_type': cv2.SVM_C_SVC,
            'C': 1,
        }
        self.model.train(samples, responses, params=svm_params)

    def predict(self, samples):
        '''Predict all samples in one call and flatten the result to 1-D.'''
        predictions = self.model.predict_all(samples)
        return predictions.ravel()
118
119
class MLP(LetterStatModel):
    '''Letter classifier backed by cv2.ANN_MLP.'''

    def __init__(self):
        self.model = cv2.ANN_MLP()

    def train(self, samples, responses):
        '''
        Create a network sized [var_n, 100, 100, class_n] and train it.

        The responses are unrolled into one row of class_n 0/1 flags per
        sample and passed to the network as float32 targets.
        '''
        _, feature_count = samples.shape
        flag_rows = self.unroll_responses(responses).reshape(-1, self.class_n)

        # Input layer, two hidden layers of 100 units, one output per class.
        layer_sizes = np.int32([feature_count, 100, 100, self.class_n])
        self.model.create(layer_sizes)

        # Reference noted alongside this model: CvANN_MLP_TrainParams::BACKPROP,0.001
        train_params = {
            'term_crit': (cv2.TERM_CRITERIA_COUNT, 300, 0.01),
            'train_method': cv2.ANN_MLP_TRAIN_PARAMS_BACKPROP,
            'bp_dw_scale': 0.001,
            'bp_moment_scale': 0.0,
        }
        targets = np.float32(flag_rows)
        self.model.train(samples, targets, None, params=train_params)

    def predict(self, samples):
        '''Return the index of the largest network output for every sample.'''
        _ret, outputs = self.model.predict(samples)
        return outputs.argmax(-1)
141
142
if __name__ == '__main__':
    import getopt
    import sys

    # Every print below uses the parenthesised single-argument form, which
    # behaves the same as a Python 2 print statement and as the Python 3
    # print function.
    print(__doc__)

    # Lookup table keyed by the lowercased class name.
    models = [RTrees, KNearest, Boost, SVM, MLP] # NBayes
    models = dict((cls.__name__.lower(), cls) for cls in models)

    args, dummy = getopt.getopt(sys.argv[1:], '', ['model=', 'data=', 'load=', 'save='])
    args = dict(args)
    args.setdefault('--model', 'rtrees')
    args.setdefault('--data', '../data/letter-recognition.data')

    # The usage text lists the models in mixed case (RTrees, KNearest, ...)
    # while the table is keyed in lowercase, so normalise the user's choice
    # before the lookup and report unknown names instead of raising KeyError.
    model_name = args['--model'].lower()
    if model_name not in models:
        print('unknown model "%s"; available models: %s'
              % (args['--model'], ', '.join(sorted(models))))
        sys.exit(1)

    print('loading data %s ...' % args['--data'])
    samples, responses = load_base(args['--data'])
    Model = models[model_name]
    model = Model()

    # The first train_ratio share of the samples is the training set, the
    # remainder is the test set.
    train_n = int(len(samples)*model.train_ratio)
    if '--load' in args:
        fn = args['--load']
        print('loading model from %s ...' % fn)
        model.load(fn)
    else:
        print('training %s ...' % Model.__name__)
        model.train(samples[:train_n], responses[:train_n])

    print('testing...')
    # Fraction of correct predictions on each split.
    train_rate = np.mean(model.predict(samples[:train_n]) == responses[:train_n])
    test_rate  = np.mean(model.predict(samples[train_n:]) == responses[train_n:])

    print('train rate: %f  test rate: %f' % (train_rate*100, test_rate*100))

    if '--save' in args:
        fn = args['--save']
        print('saving model to %s ...' % fn)
        model.save(fn)
    cv2.destroyAllWindows()
183