Package PyML :: Package containers :: Module parsers
[frames] | [no frames]

Source Code for Module PyML.containers.parsers

  1   
  2  import os 
  3  from PyML.utils import misc,myio 
  4  from PyML.base.pymlObject import PyMLobject 
  5   
  6  __docformat__ = "restructuredtext en" 
  7   
class Parser (PyMLobject) :
    '''A parser class to read datasets from a file.

    Each parser supports the following interface:

    Constructor - pass a file name / file handle and information on which
    patterns/classes/features to read from the file
    check - checks whether the file conforms to the format read by the parser
    scan - scan the file and make the _address variable that lists the positions
    in the file of all the patterns that need to be read
    next - read the next pattern (after calling the __iter__ method)
    '''

    # lines whose first character is one of these are treated as comments
    commentChar = ['%', '#']

    def __init__(self, file, **args) :
        """
        :Parameters:
          - `file` - a file name or an already-open file-like object

        :Keywords:
          - `classes` - list of class labels to read (default [] -- read all)
          - `patterns` - which patterns to read (default None -- read all)
          - `features` - which features to read (default [] -- read all)

        :Raises: ValueError if a file name is given and no file exists there
        """
        if type(file) == type('') :
            if not os.path.exists(file) :
                # call-form raise works in both python 2 and 3
                raise ValueError("file does not exist at %s" % file)
            self._fileHandle = myio.myopen(file)
        else :
            # assume an open file-like object supporting readline/seek
            self._fileHandle = file

        # dict.get replaces the repetitive if 'key' in args / else pattern
        self.classesToRead = args.get('classes', [])
        self.patternsToRead = args.get('patterns', None)
        self.featuresToRead = args.get('features', [])

    def check(self) :
        """check whether the file conforms to the format read by this
        parser -- overridden by subclasses"""
        pass

    def scan(self) :
        """scan the file and build the _address list of file offsets of the
        patterns to read -- overridden by subclasses"""
        pass

    def __iter__(self) :
        # iteration walks the addresses recorded by scan()
        self._addressIterator = iter(self._address)
        return self

    def __len__(self) :
        '''how many patterns are read'''
        return len(self._address)

    def next(self) :
        """read the next pattern -- overridden by subclasses"""
        pass

    def skipComments(self) :
        """advance past leading comment lines.

        :Returns: (line, pos) -- the first non-comment line, and the file
        offset at which that line starts
        """
        pos = 0
        line = self._fileHandle.readline()
        # 'line and' guards against an empty file, where line is '' and
        # line[0] would raise an IndexError
        while line and line[0] in self.commentChar :
            pos += len(line)
            line = self._fileHandle.readline()

        return line, pos
class SparseParser (Parser) :

    '''A class for parsing sparse data (svmlight-style ``id:value`` tokens,
    optionally preceded by "patternID," and a class label).'''

    def __init__(self, file, **args) :
        """
        :Keywords:
          - `sparsify` - when True, zero-valued features are dropped from
            each pattern (default: False)
        """
        Parser.__init__(self, file, **args)
        self.sparsify = False
        if 'sparsify' in args :
            self.sparsify = args['sparsify']

    def check(self) :
        # a file is considered sparse format if its first non-comment line
        # contains at least one ':' (feature id / value separator)
        self._fileHandle.seek(0)
        line,pos = self.skipComments()
        return len(line.split(':')) > 1

    def readLabels(self) :
        """read only the labels (and pattern IDs, if present) from the file.

        :Returns: (L, patternID) -- list of label tokens, and list of
        pattern IDs or None when the file has no IDs
        :Raises: ValueError if the data appears to be unlabeled
        """
        self._fileHandle.seek(0)
        patternID = None
        L = []

        line,pos = self.skipComments()
        # determine if the dataset has IDs :
        # (an ID is anything before a comma on the line)
        patternIDflag = (line.find(",") != -1)
        if patternIDflag :
            patternID = []
        # make sure there are labels :
        # a line whose first token after the ID contains ':' is a feature,
        # not a label
        tokens = line.split(',')[-1].split()
        if len(tokens) == 0 or tokens[0].find(':') >= 0 :
            raise ValueError, "unlabeled data"

        while line :

            if patternIDflag:
                (patID, line) = line.split(",")
                patternID.append(patID)
            L.append(line.split()[0])

            line = self._fileHandle.readline()

        return L,patternID

    def scan(self) :
        """scan the file, recording the offset of every pattern whose label
        is in self.classesToRead (or of every pattern when that list is
        empty), and detect whether feature IDs are integers.

        Side effects: sets _address, _labels, _patternID, _labeledData,
        _patternIDflag, _firstToken, integerID, and clears the feature
        dictionaries used by next().
        """
        self._fileHandle.seek(0)
        patternID = None
        self._featureID = []

        address = []

        line, pos = self.skipComments()

        # determine if the dataset has IDs :
        patternIDflag = (line.find(",") != -1)
        if patternIDflag :
            patternID = []

        # determine if the dataset has labels or not :
        # firstToken records how many leading tokens to skip when reading
        # features (1 when a label is present, 0 otherwise)
        tokens = line.split(',')[-1].split()
        if len(tokens) == 0 or tokens[0].find(':') >= 0 :
            L = None
            labeledData = 0
            firstToken = 0
        else :
            L = []
            labeledData = 1
            firstToken = 1

        self._numFeatures = 0

        # assume integer feature IDs until a non-integer one is seen
        self.integerID = True

        i = 0
        featureDict = {}
        foundIntegerID = False
        while line :
            nextPos = pos + len(line)
            if patternIDflag:
                (patID, line) = line.split(",")

            tokens = line.split()
            if labeledData :
                label = tokens[0]
            else :
                label = None
            # the integer-ID test is performed once, on the first line that
            # actually contains feature tokens
            if not foundIntegerID :
                if labeledData :
                    t = tokens[1:]
                else :
                    t = tokens
                if len(t) > 0 :
                    foundIntegerID = True
                    for token in t :
                        try :
                            int(token.split(':')[0])
                        except :
                            self.integerID = False

            # record the pattern only if its label was requested
            # (an empty classesToRead means "read everything")
            if (label in self.classesToRead or len(self.classesToRead) == 0) :
                if labeledData :
                    L.append(label)
                if patternIDflag :
                    patternID.append(patID)
                address.append(pos)

            pos = nextPos
            line = self._fileHandle.readline()
            i +=1
            if i % 100 == 0 and i > 0 :
                print 'scanned',i,'patterns'

        self._featureDict = {}
        self._featureDict2 = {}
        self._featureKeyDict = {}
        self._address = address
        self._labeledData = labeledData
        self._labels = L
        self._patternIDflag = patternIDflag
        self._patternID = patternID
        self._firstToken = firstToken

    def __iter__(self) :
        self._addressIterator = iter(self._address)

        return self

    def next(self) :
        """read the pattern at the next recorded address.

        :Returns: a dict mapping featureKey (the feature ID itself for
        integer IDs, hash(featureID) otherwise) to the float feature value;
        zero values are omitted when self.sparsify is True
        :Raises: StopIteration when the addresses are exhausted;
        ValueError on an unresolvable feature-key collision
        """
        address = self._addressIterator.next()
        self._fileHandle.seek(address)

        line = self._fileHandle.readline()
        if self._patternIDflag:
            (patID, line) = line.split(",")

        tokens = line.split()
        if self._labeledData :
            label = tokens[0]
        else :
            label = None

        x = {}
        if len(tokens) > self._firstToken : # check if this is not a zero vector
            for token in tokens[self._firstToken:] :
                (featureID, featureVal) = token.split(":")
                if self.integerID :
                    featureID = int(featureID)

                uniqueHash = True
                # handle the case where the hash function is not unique:
                # NOTE(review): _featureDict2 maps featureID to itself, so
                # this condition looks unreachable, and the retry loop below
                # neither breaks on success nor works for integer IDs
                # (int + str concatenation) -- verify before relying on it
                if (featureID in self._featureDict2 and
                    self._featureDict2[featureID] != featureID) :
                    uniqueHash = False
                    #XXX
                    for i in range(255) :
                        fid = featureID + '+' + chr(i)
                        if fid not in self._featureDict2 :
                            featureID = fid
                            uniqueHash = True
                    if not uniqueHash :
                        raise ValueError, 'non-unique hash'

                if not self.integerID :
                    featureKey = hash(featureID)
                else :
                    featureKey = featureID
                self._featureDict[featureID] = featureKey
                self._featureDict2[featureID] = featureID
                self._featureKeyDict[featureKey] = 1

                if float(featureVal) != 0.0 or not self.sparsify :
                    #x[self._featureDict[featureID]] = float(featureVal)
                    x[featureKey] = float(featureVal)

        return x

    def postProcess(self) :
        """finalize the feature bookkeeping after all patterns were read.

        :Returns: (featureID, featureKey, featureKeyDict) -- feature IDs as
        strings sorted by key, the sorted keys, and a dict mapping each key
        to its position
        :Raises: ValueError if two feature IDs mapped to the same key
        """
        if len(self._featureDict.keys()) != len(misc.unique(self._featureDict.values())) :
            print len(self._featureDict.keys()), len(misc.unique(self._featureDict.values()))
            raise ValueError, 'non-unique hash'

        featureKeyDict = {}
        featureKey = self._featureDict.values()
        featureKey.sort()
        for i in range(len(featureKey)) :
            featureKeyDict[featureKey[i]] = i
        inverseFeatureDict = misc.invertDict(self._featureDict)
        featureID = [str(inverseFeatureDict[key]) for key in featureKey]

        return featureID, featureKey, featureKeyDict
278 -class CSVParser (Parser):
279 280 """A class for parsing delimited files""" 281 282 attributes = {'idColumn' : None, 283 'labelsColumn' : None, 284 'headerRow' : False} 285
286 - def __init__(self, file, **args) :
287 288 """ 289 :Keywords: 290 - `headerRow` - True/False depending on whether the file contains a 291 header row that provides feature IDs 292 - `idColumn` - set to 0 if the data has pattern IDs in the first column 293 - `labelsColumn` - possible values: if there are no patternIDs 294 it is either 0 or -1, and if there are patternIDs, 1 or -1 295 """ 296 297 Parser.__init__(self, file, **args) 298 PyMLobject.__init__(self, None, **args) 299 300 if self.labelsColumn == 1 : 301 self.idColumn = 0 302 if self.idColumn is None and self.labelsColumn is None : 303 self._first = 0 304 else : 305 self._first = max(self.idColumn, self.labelsColumn) + 1 306 print 'label at ', self.labelsColumn
307
308 - def check(self) :
309 """very loose checking of the format of the file: 310 if the first line does not contain a colon (":") it is assumed 311 to be in csv format 312 the delimiter is determined to be "," if the first line contains 313 at least one comma; otherwise a split on whitespaces is used. 314 """ 315 316 self._fileHandle.seek(0) 317 318 line,pos = self.skipComments() 319 if len(line.split('\t')) > 1 : 320 self.delim = '\t' 321 elif len(line.split(',')) > 1 : 322 self.delim = ',' 323 else : 324 self.delim = None 325 line,pos = self.skipHeader(line,pos) 326 print 'delimiter', self.delim 327 328 # a file that does not contain a ":" is assumed to be in 329 # CSV format 330 331 if len(line.split(':')) > 1 : return False 332 333 return True
334
335 - def skipHeader(self, line, pos) :
336 """ 337 check if the file has a first line that provides the feature IDs 338 """ 339 340 tokens = line[:-1].split(self.delim) 341 if self.labelsColumn == -1 : 342 self._last = len(tokens) - 1 343 else : 344 self._last = len(tokens) 345 346 if self.headerRow : 347 self._featureID = tokens[self._first:self._last] 348 pos += len(line) 349 line = self._fileHandle.readline() 350 351 352 return line, pos
353
354 - def readLabels(self) :
355 356 self._fileHandle.seek(0) 357 358 L = [] 359 patternID = [] 360 361 line,pos = self.skipComments() 362 line, pos = self.skipHeader(line, pos) 363 tokens = line[:-1].split(self.delim) 364 if self.labelsColumn is None : 365 if len(tokens) == 2 : 366 self.labelsColumn = 1 367 self.idColumn = 0 368 elif len(tokens) == 1 : 369 self.labelsColumn = 0 370 371 i = 1 372 while line : 373 tokens = line[:-1].split(self.delim) 374 if self.idColumn is not None : 375 patternID.append(tokens[self.idColumn]) 376 else : 377 patternID.append(str(i)) 378 if self.labelsColumn is not None : 379 L.append(tokens[self.labelsColumn]) 380 line = self._fileHandle.readline() 381 i =+ 1 382 383 return L,patternID
384 385
386 - def scan(self) :
387 388 self._fileHandle.seek(0) 389 self._featureID = None 390 address = [] 391 392 line,pos = self.skipComments() 393 line, pos = self.skipHeader(line, pos) 394 395 tokens = line.split(self.delim) 396 self._patternID = [] 397 398 dim = len(tokens) - (self.idColumn is not None) - \ 399 (self.labelsColumn is not None) 400 401 self._labels = None 402 if self.labelsColumn is not None : 403 self._labels = [] 404 405 i = 0 406 while line : 407 address.append(pos) 408 pos += len(line) 409 line = self._fileHandle.readline() 410 i +=1 411 if i % 1000 == 0 and i > 0 : 412 print 'scanned',i,'patterns' 413 414 self._address = address 415 if self._featureID is None : 416 self._featureID = [str(i) for i in range(dim)]
417 418
419 - def next(self) :
420 421 address = self._addressIterator.next() 422 self._fileHandle.seek(address) 423 424 line = self._fileHandle.readline() 425 tokens = line[:-1].split(self.delim) 426 x = [float(token) for token in tokens[self._first:self._last]] 427 if self.labelsColumn is not None : 428 self._labels.append(tokens[self.labelsColumn]) 429 if self.idColumn is not None : 430 self._patternID.append(tokens[self.idColumn]) 431 432 return x
433
434 - def postProcess(self) :
435 436 featureKey = [hash(id) for id in self._featureID] 437 featureKeyDict = {} 438 for i in range(len(featureKey)) : 439 featureKeyDict[featureKey[i]] = i 440 441 return self._featureID, featureKey, featureKeyDict
442
def parserDispatcher(fileHandle, **args) :
    """return a parser appropriate for the given file.

    :Parameters:
      - `fileHandle` - a file name or open file handle

    :Keywords:
      - `hint` - 'sparse' or 'csv' to skip format auto-detection

    :Returns: a SparseParser or CSVParser instance
    :Raises: ValueError when neither parser recognizes the file format
    """
    if 'hint' in args :
        hint = args['hint']
        if hint == 'sparse' :
            return SparseParser(fileHandle, **args)
        elif hint == 'csv' :
            p = CSVParser(fileHandle, **args)
            # check() is still needed here: it detects the delimiter
            p.check()
            return p

    # no hint -- try sparse format first, then delimited
    p = SparseParser(fileHandle, **args)
    if p.check() :
        return p

    p = CSVParser(fileHandle, **args)
    if p.check() :
        return p

    # call-form raise works in both python 2 and 3
    raise ValueError('file does not match existing parsers')
466 -def test(fileName) :
467 468 p = SparseParser(fileName) 469 470 print 'p.check:',p.check() 471 472 p.scan()
473