1
2 from PyML.base.pymlObject import PyMLobject
3 from PyML.utils import misc
4 import numpy
5 import math
6
7 import random
8
def pca(X, numcomp = None) :
    '''Project the matrix X onto its numcomp leading principal components.

    If numcomp is None, all principal components are used.

    NOTE(review): X is not centered here; for standard PCA the caller is
    expected to center the columns first -- confirm against callers.
    '''
    d = numpy.shape(X)[1]
    if numcomp is None :
        numcomp = d

    # rows of vt are the right singular vectors = principal directions
    u, s, vt = numpy.linalg.svd(X)
    v = numpy.transpose(vt)
    v = v[:, :numcomp]
    # numpy.matrixmultiply is a Numeric-era name that does not exist in
    # numpy; numpy.dot is the equivalent (the debug print was removed)
    return numpy.dot(X, v)
23
24
def centerColumns(X) :
    '''returns X - mean(X), where the mean is taken over the columns of X

    NOTE(review): the original def line is missing from this chunk; the
    name and signature are reconstructed -- verify against callers.
    '''
    # numpy.mean with no axis returns a single scalar (the old Numeric
    # mean averaged over axis 0, giving one value per column); use
    # axis=0 explicitly, and let broadcasting replace the resize() trick
    m = numpy.mean(X, axis=0)
    return X - m
33
34
38
39
def standardizeColumns(X) :
    '''returns (X - mean(X)) / std(X), computed per column

    NOTE(review): the original def line is missing from this chunk; the
    name is grounded by the call in standardizeRows.
    '''
    # numpy.mean/std with no axis return scalars (the old Numeric
    # versions reduced over axis 0); compute per-column statistics and
    # rely on broadcasting instead of resize()
    m = numpy.mean(X, axis=0)
    s = numpy.std(X, axis=0)
    return (X - m) / s
49
def standardizeRows(X) :
    '''returns (X - mean(X)) / std(X), computed along each row

    Implemented by transposing, standardizing the columns, and
    transposing back.
    NOTE(review): the original def line is missing from this chunk; the
    name is grounded by the call in the row-correlation code below.
    '''
    Xt = numpy.transpose(X)
    standardized = standardizeColumns(Xt)
    return numpy.transpose(standardized)
54
55
def highVarianceVariables(X, numVariables) :
    '''returns the numVariables variables (columns) with the highest variance

    NOTE(review): the original def line is missing from this chunk; the
    name is reconstructed from the docstring -- verify against callers.
    '''
    # per-column std; numpy.std(X) with no axis would give a scalar and
    # break the argsort below (old Numeric reduced over axis 0)
    s = numpy.std(X, axis=0)
    I = numpy.argsort(s)

    # the last numVariables positions of the argsort hold the columns
    # with the largest standard deviation
    Xout = numpy.take(X, I[-numVariables:], 1)

    return Xout
65
66
def euclideanDistanceSquared(X) :
    '''returns the matrix of pairwise Euclidean distances-squared between
    the rows of X

    NOTE(review): the original def line is missing from this chunk; the
    name is reconstructed -- verify against callers.
    '''
    # ||x_i - x_j||^2 = K[i,i] - 2 K[i,j] + K[j,j]  with  K = X X^T
    # (matrixmultiply is the Numeric-era name for numpy.dot)
    K = numpy.dot(X, numpy.transpose(X))
    n = numpy.shape(K)[0]
    # numpy.float was removed from numpy; it was an alias for float
    D = numpy.zeros((n, n), float)

    # the original looped over range(1, n-1), silently leaving row and
    # column 0 at zero; start at 0 to cover all pairs
    for i in range(n - 1) :
        for j in range(i + 1, n) :
            D[i, j] = K[i, i] - 2 * K[i, j] + K[j, j]
            D[j, i] = D[i, j]

    return D
80
81
def norm2(x) :
    '''return the 2-norm (Euclidean length) of a vector given as a list
    or numpy array

    NOTE(review): the original def line is missing from this chunk; the
    name is grounded by the call in the row-normalization code.
    '''
    v = numpy.asarray(x)
    return math.sqrt(numpy.dot(v, v))
88
89
def normalizeRows(X) :
    '''normalize each row of X to a unit vector (2-norm 1)

    caveat: a row of all zeros leads to division by zero.
    NOTE(review): the original def line is missing from this chunk; the
    name is reconstructed -- verify against callers.
    '''
    (numRows, numCols) = numpy.shape(X)
    # numpy.float was removed from numpy; it was an alias for float
    Xnorm = numpy.zeros((numRows, numCols), float)

    for i in range(numRows) :
        # 2-norm computed inline (equivalent to the module's norm2)
        Xnorm[i] = X[i] / math.sqrt(numpy.sum(X[i] * X[i]))

    return Xnorm
100
102
104
105 if type(data) == type('') :
106 print 'file name:', data
107 data = datafunc.PyVectorDataSet(data, idColumn = 0, headerRow = True, hint = 'csv')
108
109 self.data = data
110 self.idDict = misc.list2dict(data.labels.patternID,
111 range(len(data)))
112
113 print numpy.shape(data.X)
114 self.mean = numpy.mean(data.X, 1)
115 self.std = std(data.X, 1)
116 eps = 1e-5
117 I = numpy.nonzero(numpy.less(self.std, eps))[0]
118 print 'num zeros:',len(I)
119 numpy.put(self.std, I, 1)
120
121 self.numCorrelations = 10000
122 correlations = numpy.zeros(self.numCorrelations, numpy.float)
123
124 for i in range(self.numCorrelations) :
125 i1 = random.randrange(0, len(data))
126 i2 = random.randrange(0, len(data))
127 correlations[i] = self._corrcoef(i1, i2)
128 self.meanCorrelation = numpy.mean(correlations)
129 self.numCorrelations = 1000
130
132
133 if id1 == id2 : return 1.0
134 if type(id1) == type(1) :
135 return self._corrcoef(id1, id2)
136 if id1 not in self.idDict and id2 not in self.idDict :
137 return self.meanCorrelation
138 if id1 in self.idDict and id2 in self.idDict :
139 return self._corrcoef(self.idDict[id1], self.idDict[id2])
140 else :
141
142 if id2 not in self.idDict :
143 id1,id2 = id2,id1
144 i2 = self.idDict[id2]
145 correlations = numpy.zeros(self.numCorrelations, numpy.float)
146 for i in range(self.numCorrelations) :
147 i1 = random.randrange(0, len(self.data))
148 correlations[i] = self._corrcoef(i1, i2)
149 return numpy.mean(correlations)
150
152
153 return numpy.dot(self.data.X[i1] - self.mean[i1],
154 self.data.X[i2] - self.mean[i2]) / \
155 (len(self.data.X[i1]) * self.std[i1] * self.std[i2])
156
157
def corrcoef(X) :
    '''compute the matrix of correlations between the rows of the matrix X;
    more space efficient than the numpy version

    NOTE(review): the original def line is missing from this chunk; the
    name is reconstructed -- verify against callers.
    '''
    (n, d) = numpy.shape(X)

    m = numpy.mean(X, 1)
    std = numpy.std(X, 1)

    # numpy.float was removed from numpy; it was an alias for float
    # (diagonal is 1 by construction, hence ones())
    K = numpy.ones((n, n), float)

    for i in range(0, n - 1) :
        for j in range(i + 1, n) :
            # the original centered X[j] by m[i]; each row must be
            # centered by its own mean (compare _corrcoef above)
            K[i][j] = numpy.dot(X[i] - m[i], X[j] - m[j]) / (d * std[i] * std[j])
            K[j][i] = K[i][j]

    return K
175
177 """std(m,axis=0) returns the standard deviation along the given
178 dimension of m. The result is unbiased with division by N-1.
179 If m is of integer type returns a floating point answer.
180 """
181 x = numpy.asarray(m)
182 n = float(x.shape[axis])
183 mx = numpy.asarray(numpy.mean(x,axis))
184 if axis < 0:
185 axis = len(x.shape) + axis
186 mx.shape = mx.shape[:axis] + (1,) + mx.shape[axis:]
187 x = x - mx
188 return numpy.sqrt(numpy.add.reduce(x*x,axis)/(n))
189
191
192 (n,d) = numpy.shape(X)
193
194 Xn = standardizeRows(X)
195
196 return numpy.dot(Xn, numpy.transpose(Xn)) / (d - 1)
197
199
def rowCorrelation(X, i, j) :
    '''correlation between rows i and j of the matrix X

    NOTE(review): the original def line is missing from this chunk; name
    and signature are reconstructed from the body -- verify against
    callers.
    '''
    (n, d) = numpy.shape(X)

    m = numpy.mean(X, 1)
    std = numpy.std(X, 1)

    # the original centered X[j] by m[i]; each row must be centered by
    # its own mean (compare _corrcoef above)
    return numpy.dot(X[i] - m[i], X[j] - m[j]) / (d * std[i] * std[j])
207
208
210 """
211 class for performing feature normalization
212
213 For each feature the Standardizer subtracts the feature's mean
214 and divides by its standard deviation
215
216 this rescaling is composed of two operations:
217
218 1. ``centering`` -- subtract from a feature its mean value;
219 this is referred to as 'translation'; the translation attribute
220 gives the value with which to translate each feature
221 2. ``scaling`` -- divide a feature by a scale, e.g. its standard deviation;
222 the 'scale' attribute gives the value with which to scale each feature
223
224 the 'train' method of the class computes the translation and scaling
225 factors, and performs normalization of the training data
226 the 'test' method uses values computed on the training data to normalize
227 the test data.
228
229 **caveat:**
230 Beware of performing training multiple times on the same dataset:
231 if a dataset has already been standardized, re-standardization
232 will recompute mean and standard deviation, which will be approximately
233 0 and 1 for each feature; subsequent application on test data will
234 have no effect. Because of this an exception is raised if the user
235 attempts to re-train an already trained Rescale object.
236 """
237
238 attributes = {'translate' : True,
239 'rescale' : True,
240 'translation' : None,
241 'scale' : None}
242
246
247 - def train(self, data, *options, **args) :
248
249 if self.translation is not None or self.scale is not None :
250 raise ValueError, 'object already trained'
251 if self.translate :
252 self.translation = data.mean()
253 if self.rescale :
254 self.scale = numpy.array(data.std())
255
256
257 eps = 1e-5
258 I = numpy.nonzero(numpy.less(self.scale, eps))[0]
259 numpy.put(self.scale, I, 1)
260
261 for i in range(len(self.scale)) :
262 if self.scale[i] == 0 and self.scale[i] == 1 :
263 self.scale[i] = 1
264
265 self.preproc(data)
266
273
274 - def test(self, data, *options, **args) :
277