Package PyML :: Package containers :: Module vectorDatasets
[frames | no frames]

Source Code for Module PyML.containers.vectorDatasets

  1   
  2  import numpy 
  3   
  4  from PyML.containers.baseDatasets import WrapperDataSet, BaseVectorDataSet 
  5  from PyML.utils import arrayWrap,misc 
  6  from ext import csparsedataset,cvectordataset 
  7   
class BaseCVectorDataSet(WrapperDataSet, BaseVectorDataSet):
    """A base class for vector dataset containers implemented in C++

    The methods of this class validate and convert their arguments in
    Python and then delegate the work to ``self.container``, the extension
    class selected in ``__init__``.  Sequence arguments are converted to
    plain Python lists before they are handed to the container.
    """

    def __init__(self):
        """Select the C++ container class that matches the concrete class.

        The choice is made on the class *name*; ``self.container`` is left
        unset for any name other than 'SparseDataSet' or 'VectorDataSet'.
        """
        className = self.__class__.__name__
        if className == 'SparseDataSet':
            self.container = csparsedataset.SparseDataSet
        elif className == 'VectorDataSet':
            self.container = cvectordataset.VectorDataSet

    def copy(self, other, patterns, deepcopy):
        """
        copy a wrapper dataset

        :Parameters:
          - `other` - the other dataset
          - `patterns` - a list of patterns to copy; None copies all the
            patterns of `other`
          - `deepcopy` - a 0/1 flag telling whether to do deepcopy or not
            (not used by this implementation)
        """
        if patterns is None:
            # list() keeps this a real list on Python 3 as well
            patterns = list(range(len(other)))
        self.container.__init__(self, other, patterns)
        self.featureDict = other.featureDict.copy()
        self.featureID = other.featureID[:]

    def initializeDataMatrix(self, numPatterns, numFeatures):
        """Initialize the underlying container for `numPatterns` patterns.

        `numFeatures` is accepted but not passed on to the container.
        """
        self.container.__init__(self, numPatterns)

    def addPattern(self, x, i):
        """Add the pattern `x` to the container.

        :Parameters:
          - `x` - a dictionary, a list or a numpy array
          - `i` - not used by this implementation

        Raises TypeError when `x` is none of the supported types.
        """
        if isinstance(x, dict):
            keys, values = arrayWrap.dict2vectors(x)
        elif isinstance(x, (numpy.ndarray, list)):
            # a list/array carries no keys: an empty key vector is passed
            keys = arrayWrap.longVector([])
            values = arrayWrap.doubleVector(x)
        else:
            raise TypeError("data vectors must be dictionary, list or arrays")
        self.container.addPattern(self, keys, values)

    def addFeature(self, id, values):
        """Add a feature to the dataset.

        :Parameters:
          - `id` - the ID of the new feature; an integer ID is converted to
            its string form
          - `values` - one value per pattern in the dataset

        Raises ValueError when the number of values differs from the
        dataset size, or when the hash of `id` is already a key of
        ``featureKeyDict``.
        """
        if len(values) != self.size():
            raise ValueError(
                'number of values provided does not match dataset size')
        if isinstance(id, int):
            id = str(id)
        hashID = hash(id)
        if not hasattr(self, 'featureKeyDict'):
            self.addFeatureKeyDict()
        if hashID in self.featureKeyDict:
            raise ValueError('Feature already exists, or hash clash')
        if type(values) is not list:
            # the container is always handed a plain list
            values = list(values)

        self.container.addFeature(self, hashID, values)
        self.updateFeatureDict(id)

    def addFeatures(self, other):
        """Add the features of the dataset `other` to this dataset.

        Raises ValueError when the two datasets differ in length, or when
        the hash of one of the feature IDs of `other` is already a key of
        ``featureKeyDict``.
        """
        if len(other) != len(self):
            raise ValueError('number of examples does not match')
        if not hasattr(self, 'featureKeyDict'):
            self.addFeatureKeyDict()
        for featureID in other.featureID:
            if hash(featureID) in self.featureKeyDict:
                raise ValueError('Feature already exists, or hash clash')

        self.container.addFeatures(self, other)
        self.updateFeatureDict(other)

    def getPattern(self, i):
        """Return pattern `i` as provided by the container.

        Raises ValueError when `i` is not in the range 0..len(self)-1.
        """
        if i < 0 or i >= len(self):
            raise ValueError('Index out of range')
        return self.container.getPattern(self, i)

    def extendX(self, other, patterns):
        """Extend the container with the given `patterns` of `other`."""
        self.container.extend(self, other, patterns)

    def eliminateFeatures(self, featureList):
        """eliminate a list of features from a dataset

        :Parameters:
          - `featureList` - the features to eliminate, given either as
            numbers between 0 and numFeatures-1 (indices of features, not
            their IDs) or as feature names, which are translated with
            ``featureNames2IDs``

        An empty `featureList` is a no-op.  Raises ValueError when an index
        is out of range.  The caller's sequence is not modified.
        """
        # len() rather than truth testing: the argument may be a numpy array
        if len(featureList) == 0:
            return
        if isinstance(featureList[0], str):
            featureList = self.featureNames2IDs(featureList)
        # sorted() accepts any sequence and always yields a new plain list,
        # so tuples work and the caller's list is not reordered in place
        featureList = sorted(featureList)
        if featureList[-1] >= self.numFeatures or featureList[0] < 0:
            raise ValueError('Bad feature list')

        self.container.eliminateFeatures(self, featureList)
        self.updateFeatureDict(featureList)

    def scale(self, w):
        """rescale the columns of the data matrix by a weight vector w:
        set X[i][j] = X[i][j] * w[j]

        :Parameters:
          - `w` - a sequence of weights, or a single number which is then
            used as the weight of every feature
        """
        if isinstance(w, (int, float)):
            w = [float(w)] * self.numFeatures
        elif type(w) is not list:
            w = list(w)
        self.container.scale(self, w)

    def translate(self, c):
        """Pass the sequence `c`, converted to a list, to the container's
        ``translate`` (presumably the per-feature shift applied to the data
        -- confirm against the C++ implementation).
        """
        if type(c) is not list:
            c = list(c)
        self.container.translate(self, c)

    def _checkedPatterns(self, patterns,
                         message='Pattern index out of range'):
        """Return `patterns` as a validated plain list of pattern indices.

        None selects every pattern of the dataset.  Raises ValueError with
        `message` when an index lies outside 0..len(self)-1; an empty
        selection raises the ValueError produced by ``min()``.
        """
        if patterns is None:
            patterns = range(len(self))
        if type(patterns) is not list:
            patterns = list(patterns)
        if min(patterns) < 0 or max(patterns) >= len(self):
            raise ValueError(message)
        return patterns

    def mean(self, patterns=None):
        """Return the container's ``mean`` over `patterns`
        (all the patterns when None).

        Raises ValueError when a pattern index is out of range.
        """
        return self.container.mean(self, self._checkedPatterns(patterns))

    def std(self, patterns=None):
        """Return the container's ``standardDeviation`` over `patterns`
        (all the patterns when None).

        Raises ValueError when a pattern index is out of range.
        """
        return self.container.standardDeviation(
            self, self._checkedPatterns(patterns))

    def featureCount(self, feature, patterns=None):
        """Return the container's ``featureCount`` of `feature` over
        `patterns` (all the patterns when None).

        Raises ValueError when a pattern index is out of range.
        """
        return self.container.featureCount(
            self, feature, self._checkedPatterns(patterns))

    def featureCounts(self, patterns=None):
        """Return the container's ``featureCounts`` over `patterns`
        (all the patterns when None).

        Raises ValueError when a pattern index is out of range.
        """
        return self.container.featureCounts(
            self, self._checkedPatterns(patterns))

    def nonzero(self, feature, patterns=None):
        """Return the container's ``nonzero`` result for `feature` over
        `patterns` (all the patterns when None).

        Raises ValueError when a pattern index is out of range.
        """
        patterns = self._checkedPatterns(
            patterns, 'Pattern index goes outside of range')
        return self.container.nonzero(self, feature, patterns)

    def commonFeatures(self, pattern1, pattern2):
        """Return the ``featureKeyDict`` entries of the feature keys that the
        container reports as common to `pattern1` and `pattern2`.
        """
        # build the key dictionary on demand, as addFeature/addFeatures do
        if not hasattr(self, 'featureKeyDict'):
            self.addFeatureKeyDict()
        commonKeys = self.container.commonFeatures(self, pattern1, pattern2)
        return [self.featureKeyDict[featureKey] for featureKey in commonKeys]

    def normalize(self, norm=2):
        """Normalize the data through the container.

        :Parameters:
          - `norm` - 1 or 2 (converted with ``int``)

        Raises ValueError for any other value of `norm`.
        """
        norm = int(norm)
        if norm not in (1, 2):
            raise ValueError('bad value for norm')
        self.container.normalize(self, norm)

class VectorDataSet(BaseCVectorDataSet, cvectordataset.VectorDataSet):
    """A vector dataset container backed by ``cvectordataset.VectorDataSet``.

    ``featureID`` holds the feature IDs and ``featureDict`` maps each
    feature ID to its position in ``featureID``.
    """

    def __init__(self, arg=None, **args):
        """Select the container class, then let ``BaseVectorDataSet``
        construct the dataset from `arg` and the keyword arguments.
        """
        BaseCVectorDataSet.__init__(self)
        BaseVectorDataSet.__init__(self, arg, **args)

    def addPattern(self, x, i):
        """Add the pattern `x` to the container.

        :Parameters:
          - `x` - a list or a numpy array
          - `i` - not used by this implementation

        Raises TypeError when `x` is neither a list nor an array.
        """
        if not isinstance(x, (numpy.ndarray, list)):
            raise TypeError("data vectors must be list or array")
        self.container.addPattern(self, arrayWrap.doubleVector(x))

    def updateFeatureDict(self, arg=None):
        """Bring ``featureID`` and ``featureDict`` up to date after a change
        to the features of the dataset.

        :Parameters:
          - `arg` - what changed: a dataset of the same class whose features
            were appended, a list of indices of eliminated features, or the
            ID (integer or string) of a single added feature.  Any other
            value only rebuilds ``featureDict``.
        """
        if arg.__class__ == self.__class__:
            # features were extended with those in another dataset
            self.featureID.extend(arg.featureID)
        elif isinstance(arg, list):
            # features were eliminated:
            eliminated = misc.list2dict(arg)
            self.featureID = [featureID
                              for i, featureID in enumerate(self.featureID)
                              if i not in eliminated]
        elif isinstance(arg, (int, str)):
            # a feature was added: it takes the last position, so the
            # dictionary does not need to be rebuilt
            self.featureID.append(arg)
            self.featureDict[arg] = self.numFeatures - 1
            return

        self.featureDict = {}
        for i in range(self.numFeatures):
            self.featureDict[self.featureID[i]] = i

class SparseDataSet(BaseCVectorDataSet, csparsedataset.SparseDataSet):
    """A sparse dataset container backed by ``csparsedataset.SparseDataSet``.

    ``featureID`` is kept sorted by the hash of the feature IDs whenever
    features are added, and ``featureDict`` maps each feature ID to its
    position in ``featureID``.
    """

    def __init__(self, arg=None, **args):
        """Select the container class, then let ``BaseVectorDataSet``
        construct the dataset from `arg` and the keyword arguments.
        """
        BaseCVectorDataSet.__init__(self)
        BaseVectorDataSet.__init__(self, arg, **args)

    def updateFeatureDict(self, arg=None):
        """Bring ``featureID`` and ``featureDict`` up to date after a change
        to the features of the dataset.

        :Parameters:
          - `arg` - what changed: a dataset of the same class whose features
            were added, a list of indices of eliminated features, or the ID
            (integer or string) of a single added feature.  Any other value
            only rebuilds ``featureDict``.
        """
        if arg.__class__ == self.__class__:
            # features were extended with those in another dataset
            self.featureID.extend(arg.featureID)
            # key=hash gives the same (stable) order as comparing the hashes
            # pairwise, and computes each hash only once
            self.featureID.sort(key=hash)
        elif isinstance(arg, list):
            # features were eliminated:
            eliminated = misc.list2dict(arg)
            self.featureID = [featureID
                              for i, featureID in enumerate(self.featureID)
                              if i not in eliminated]
        elif isinstance(arg, (int, str)):
            # a feature was added:
            self.featureID.append(arg)
            self.featureID.sort(key=hash)

        self.featureDict = {}
        for i, featureID in enumerate(self.featureID):
            self.featureDict[featureID] = i