1 from PyML.utils import misc
2
3 from baseClassifiers import Classifier,IteratorClassifier
4 from composite import CompositeClassifier
5 import svm
6
7 '''classes for model selection'''
8
9 __docformat__ = "restructuredtext en"
10
11
class Param (IteratorClassifier) :

    """
    A class for training a classifier with several values of a parameter.
    Training trains a classifier for each value of the parameter.
    Testing returns a list evaluating each trained classifier on the given
    dataset.

    Example::

        p = Param(svm.SVM(), 'C', [0.1, 1, 10, 100, 1000])
    """
24
25 - def __init__(self, arg, attribute = 'C', values = [0.1, 1, 10, 100, 1000]) :
26 """
27 :Parameters:
28 - `arg` - another Param object, or the classifier to be used
29 - `attribute` - the attribute of the classifier that needs tuning
30 - `values` - a list of values to try
31 """
32
33 if arg.__class__ == self.__class__ :
34 other = arg
35 self.attribute = other.attribute
36 self.values = other.values[:]
37 self.classifiers = [classifier.__class__(classifier)
38 for classifier in other.classifiers]
39 for i in range(len(self)) :
40 misc.mysetattr(self.classifiers[i], self.attribute, self.values[i])
41 elif hasattr(arg, 'type') and arg.type == 'classifier' :
42 self.attribute = attribute
43 self.values = values
44 self.classifiers = [arg.__class__(arg)
45 for i in range(len(self.values))]
46 for i in range(len(self)) :
47 misc.mysetattr(self.classifiers[i], self.attribute, self.values[i])
48 elif type(arg) == type([]) :
49 self.classifiers = [arg[i].__class__(arg[i])
50 for i in range(len(arg))]
51
55
57
58 rep = '<' + self.__class__.__name__ + ' instance>\n'
59 rep += 'classifier:\n'
60 rep += self.classifiers[0].__repr__()
61 rep += 'attribute: %s\n' % self.attribute
62 rep += 'values:' + str(self.values) + '\n'
63
64 return rep
65
66
67 - def train(self, data, **args) :
71
72
73
class ParamGrid (IteratorClassifier) :

    """
    A class for training and testing a classifier on a grid of parameter
    values for two attributes of the classifier.

    Example::

        p = ParamGrid(svm.SVM(ker.Gaussian()), 'C', [0.1, 1, 10, 100, 1000],
                      'kernel.gamma', [0.001, 0.01, 0.1, 1, 10])
    """
84
85 - def __init__(self, arg,
86 attribute1 = 'C', values1 = [0.1, 1, 10, 100, 1000],
87 attribute2 = 'kernel.gamma', values2 = [0.001, 0.01, 0.1, 1, 10]) :
88
89 """
90 :Parameters:
91 - `arg` - another Param object, or the classifier to be used
92 - `attribute1` - the first attribute of the classifier that needs tuning
93 - `values1` - a list of values to try for attribute1
94 - `attribute2` - the second attribute
95 - `values2` - a list of values to try for attribute2
96
97 """
98
99
100 if arg.__class__ == self.__class__ :
101 other = arg
102 self.attribute1 = other.attribute1
103 self.values1 = other.values1[:]
104 self.attribute2 = other.attribute2
105 self.values2 = other.values2[:]
106 self.classifiers = [classifier.__class__(classifier)
107 for classifier in other.classifiers]
108 elif hasattr(arg, 'type') and arg.type == 'classifier' :
109 self.attribute1 = attribute1
110 self.values1 = values1
111 self.attribute2 = attribute2
112 self.values2 = values2
113
114 self.classifiers = [arg.__class__(arg)
115 for i in range(len(values1) * len(values2))]
116
117 for i in range(len(self.values1)) :
118 for j in range(len(self.values2)) :
119 classifierID = i * len(self.values2) + j
120 misc.mysetattr(self.classifiers[classifierID],
121 self.attribute1,
122 self.values1[i])
123 misc.mysetattr(self.classifiers[classifierID],
124 self.attribute2,
125 self.values2[j])
126
127
129
130 rep = '<' + self.__class__.__name__ + ' instance>\n'
131 rep += 'classifier:\n'
132 rep += self.classifiers[0].__repr__()
133 rep += 'attribute1: %s\n' % self.attribute1
134 rep += 'values1:' + str(self.values1) + '\n'
135 rep += 'attribute2: %s\n' % self.attribute2
136 rep += 'values2:' + str(self.values2) + '\n'
137
138 return rep
139
140
class ModelSelector (Classifier) :

    """
    A model selector decides on the best classifier parameters
    using the param object it receives as input.
    Parameters are chosen according to the success rate in CV (or success
    on a dataset provided to the train method).
    """

    # default settings; presumably merged into the instance by
    # Classifier.__init__ (train reads self.foldsToPerform, which is set
    # nowhere else) -- confirm against baseClassifiers.Classifier
    attributes = {'numFolds' : 5,
                  'measure' : 'balancedSuccessRate',
                  'foldsToPerform' : 5,}
153
155 """
156 :Parameters:
157 - `arg` - another ModelSelector or a Param object
158
159 :Keywords:
160 - `measure` - which measure of accuracy to use for selecting the
161 best classifier (default = 'balancedSuccessRate')
162 supported measures are: 'balancedSuccessRate', 'successRate',
163 'roc', 'roc50' (you can substitute any number instead of 50)
164 - `numFolds` - number of CV folds to use when performing model selection
165 - `foldsToPerform` - the number of folds to actually perform
166 """
167
168
169 Classifier.__init__(self, **args)
170
171 if arg.__class__ == self.__class__ :
172 self.param = arg.param.__class__(arg.param)
173 self.measure = arg.measure
174 self.numFolds = arg.numFolds
175 elif arg.__class__.__name__.find('Param') >= 0 :
176 self.param = arg.__class__(arg)
177 else :
178 raise ValueError, 'wrong type of input for ModelSelector'
179
180 self.classifier = None
181
183
184 rep = '<' + self.__class__.__name__ + ' instance>\n'
185 if self.classifier is not None :
186 rep += self.classifier.__repr__()
187 else :
188 rep += self.param.__repr__()
189
190 return rep
191
192
193 - def train(self, data, **args) :
194 """
195 :Keywords:
196 - `train` - boolean - whether to train the best classifier
197 (default: True)
198 """
199
200 Classifier.train(self, data, **args)
201
202 maxSuccessRate = 0
203 bestClassifier = None
204 classifierIdx = 0
205 args['numFolds'] = self.numFolds
206 args['foldsToPerform'] = self.foldsToPerform
207 for r in self.param.stratifiedCV(data, **args) :
208 successRate = getattr(r, self.measure)
209 if successRate > maxSuccessRate :
210 bestClassifier = classifierIdx
211 maxSuccessRate = successRate
212 classifierIdx += 1
213
214 self.log.maxSuccessRate = maxSuccessRate
215
216 self.classifier = self.param.classifiers[bestClassifier].__class__(
217 self.param.classifiers[bestClassifier])
218
219 if 'train' not in args or args['train'] is True :
220 self.classifier.train(data, **args)
221
222 self.classifier.log.trainingTime = self.getTrainingTime()
223 self.classifier.log.classifier = self.classifier.__class__(self.classifier)
224
225
    def save(self, fileHandle) :
        """
        Save the selected classifier to fileHandle.

        Delegates to the chosen classifier's own save method; the
        ModelSelector wrapper itself is not persisted. self.classifier is
        None until train() has been called, so saving an untrained selector
        will fail.
        """
        self.classifier.save(fileHandle)
229
230
class SVMselect (Classifier) :

    """
    A model selector for searching for best parameters for an
    SVM classifier with a Gaussian kernel.
    Its search strategy is as follows:
    First optimize the width of the Gaussian (gamma) for a fixed (low)
    value of C, and then optimize C.
    """

    # default search grids and settings; presumably merged into the
    # instance by Classifier.__init__ -- confirm against
    # baseClassifiers.Classifier
    attributes = {'C' : [0.01, 0.1, 1, 10, 100, 1000],
                  'gamma' : [0.001, 0.01, 0.1, 1, 10],
                  'Clow' : 10,
                  'numFolds' : 5,
                  'measure' : 'balancedSuccessRate'}
245
    def __init__(self, arg = None, **args) :
        """
        :Parameters:
          - `arg` - another ModelSelector object

        :Keywords:
          - `C` - a list of values to try for C
          - `gamma` - a list of value to try for gamma
          - `measure` - which measure of accuracy to use for selecting the
            best classifier (default = 'balancedSuccessRate')
            supported measures are: 'balancedSuccessRate', 'successRate',
            'roc', 'roc50' (you can substitute another number instead of 50)
          - `numFolds` - number of CV folds to use when performing model selection
        """

        # Classifier.__init__ handles copy construction and the keyword
        # arguments (presumably also applying the class-level 'attributes'
        # defaults -- confirm against baseClassifiers.Classifier)
        Classifier.__init__(self, arg, **args)

        # no classifier is selected until train() is called
        self.classifier = None
264
266
267 rep = '<' + self.__class__.__name__ + ' instance>\n'
268 if self.classifier is not None :
269 rep += self.classifier.__repr__()
270 rep += 'C: ' + str(self.C) + '\n'
271 rep += 'gamma: ' + str(self.gamma) + '\n'
272
273 return rep
274
275 - def train(self, data, **args) :
276 """
277 :Keywords:
278 - `train` - boolean - whether to train the best classifier
279 (default: True)
280 - `vdata` - data to use for testing instead of using cross-validation
281 (not implemented yet)
282 """
283 Classifier.train(self, data, **args)
284
285 import ker
286 kernel = ker.Gaussian()
287 gammaSelect = ModelSelector(Param(svm.SVM(kernel, C = self.Clow),
288 'kernel.gamma', self.gamma),
289 measure = self.measure,
290 numFolds = self.numFolds)
291 gammaSelect.train(data)
292
293 kernel = ker.Gaussian(gamma = gammaSelect.classifier.kernel.gamma)
294 cSelect = ModelSelector(Param(svm.SVM(kernel), 'C', self.C),
295 measure = self.measure,
296 numFolds = self.numFolds)
297 cSelect.train(data)
298
299 self.classifier = cSelect.classifier.__class__(cSelect.classifier)
300
301 if 'train' not in args or args['train'] is True :
302 self.classifier.train(data, **args)
303
304 self.classifier.log.trainingTime = self.getTrainingTime()
305 self.classifier.log.classifier = self.classifier.__class__(self.classifier)
306