Package PyML :: Package containers :: Module ker
[frames] | no frames]

Source Code for Module PyML.containers.ker

  1   
  2  import sys 
  3  import math 
  4  import os 
  5   
  6  from ext import ckernel 
  7  from ext import cstringkernel 
  8  from PyML.utils import misc 
  9  from PyML.base.pymlObject import PyMLobject 
 10  from ext.ckernel import NONE, COSINE, TANIMOTO, DICES 
 11   
# Names of the supported kernel normalization methods.  attachKernel converts
# a method name into its position in this list before passing it to a kernel
# constructor.
# NOTE(review): presumably the positions mirror the NONE, COSINE, TANIMOTO and
# DICES constants imported from ext.ckernel -- confirm.
normalizationMethods = ['none', 'cosine', 'tanimoto', 'dices']

# markup language used by the docstrings in this module
__docformat__ = "restructuredtext en"

# NOTE: this string comes after the imports, so it is a plain expression
# statement and is not stored as the module's __doc__.
"""functionality for dealing with kernels and kernel objects"""
 17   
class Kernel (object) :
    """Abstract base class shared by all kernel objects.

    A concrete kernel implements ``eval(self, datai, i, j, dataj = None)``,
    which computes the kernel value between pattern ``i`` of dataset
    ``datai`` and pattern ``j``.  Pattern ``j`` is taken from ``dataj`` when
    that argument is given and from ``datai`` otherwise.

    Subclasses also implement ``constructionParams``, which ``dump`` uses to
    build a constructor expression for the object.
    """

    # tag that lets other code recognize kernel objects (see attachKernel)
    type = 'kernel'

    def __repr__(self) :
        """Return ``<ClassName instance>`` followed by a newline."""
        return '<%s instance>\n' % self.__class__.__name__

    def dump(self) :
        """Return a string that can be used to construct an equivalent object.

        The string has the form ``module.ClassName(params)`` where ``params``
        is whatever ``constructionParams`` returns.
        """
        pieces = [self.__module__, '.', self.__class__.__name__,
                  '(', self.constructionParams(), ')']
        return ''.join(pieces)

    def constructionParams(self) :
        """Return the constructor arguments as a string; subclasses override."""
        raise NotImplementedError

    def eval(self, datai, i, j, dataj = None) :
        """Evaluate the kernel between patterns ``i`` and ``j``.

        Both patterns come from ``datai`` unless ``dataj`` is given, in which
        case pattern ``j`` is taken from ``dataj``.  Subclasses override.
        """
        raise NotImplementedError
57 58
class Linear (Kernel, ckernel.Linear) :
    """The linear kernel (dot product of two patterns).

    Construction::

        k = Linear()

    :Keywords:
      - `normalization` - value stored in the ``normalization`` attribute
        [default: NONE]
    """

    def __init__(self, arg = None, **args) :
        """Create a linear kernel.

        When ``arg`` is an instance of exactly this class it is forwarded to
        ``ckernel.Linear.__init__`` (presumably a copy constructor) and no
        keyword is looked at.
        """
        if arg.__class__ == self.__class__ :
            ckernel.Linear.__init__(self, arg)
            return
        ckernel.Linear.__init__(self)
        self.normalization = args.get('normalization', NONE)

    def constructionParams(self) :
        """Return the constructor arguments used by ``dump``: none."""
        return ""

    def eval (self, datai, i, j, dataj = None) :
        """Evaluate the kernel between pattern ``i`` of ``datai`` and pattern
        ``j`` of ``dataj`` (``datai`` when ``dataj`` is not given)."""
        if dataj is None :
            dataj = datai
        first = datai.castToBase()
        second = dataj.castToBase()
        return ckernel.Linear.eval(self, first, i, j, second)
86
class Cosine (Kernel, ckernel.Cosine) :
    """The cosine kernel.

    Construction::

        k = Cosine()
    """

    def __init__(self, arg = None) :
        """Create a cosine kernel.

        When ``arg`` is an instance of exactly this class it is forwarded to
        ``ckernel.Cosine.__init__`` (presumably a copy constructor).
        """
        if arg.__class__ == self.__class__ :
            ckernel.Cosine.__init__(self, arg)
            return
        ckernel.Cosine.__init__(self)

    def constructionParams(self) :
        """Return the constructor arguments used by ``dump``: none."""
        return ''

    def eval (self, datai, i, j, dataj = None) :
        """Evaluate the kernel between pattern ``i`` of ``datai`` and pattern
        ``j`` of ``dataj`` (``datai`` when ``dataj`` is not given)."""
        if dataj is None :
            dataj = datai
        first = datai.castToBase()
        second = dataj.castToBase()
        return ckernel.Cosine.eval(self, first, i, j, second)
109 110
class Polynomial (Kernel, ckernel.Polynomial) :
    """
    A Polynomial kernel
    K(x,y) = (<x, y> + additiveConst) ** degree

    Construction:
    k = Polynomial(degree, additiveConst)

    Attributes:
    additiveConst, degree - kernel parameters
    """

    # attribute name -> value used when the keyword is not supplied
    attributes = {'normalization' : NONE,
                  'degree' : 2,
                  'additiveConst' : 1.0}

    def __init__(self, arg = 2, **args):
        """Create a polynomial kernel.

        ``arg`` is either an instance of exactly this class, which is
        forwarded to ``ckernel.Polynomial.__init__`` (presumably a copy
        constructor), or the degree.  Every name in ``attributes`` may also
        be given as a keyword; a positional degree other than 2 takes
        precedence over a ``degree`` keyword.
        """
        if arg.__class__ == self.__class__ :
            ckernel.Polynomial.__init__(self, arg)
            return
        ckernel.Polynomial.__init__(self)
        for name in self.attributes :
            setattr(self, name, args.get(name, self.attributes[name]))
        if arg != 2 :
            self.degree = arg

    def __repr__(self) :
        """Return a multi-line description listing the kernel parameters."""
        lines = ['<' + self.__class__.__name__ + ' instance>',
                 'degree : ' + str(self.degree),
                 'affine coefficient : ' + str(self.additiveConst)]
        return '\n'.join(lines)

    def constructionParams(self) :
        """Return the keyword arguments that ``dump`` places in the
        constructor call."""
        return 'degree = %s,additiveConst = %s' % (str(self.degree),
                                                   str(self.additiveConst))

    def eval (self, datai, i, j, dataj = None) :
        """Evaluate the kernel between pattern ``i`` of ``datai`` and pattern
        ``j`` of ``dataj`` (``datai`` when ``dataj`` is not given)."""
        if dataj is None :
            dataj = datai
        first = datai.castToBase()
        second = dataj.castToBase()
        return ckernel.Polynomial.eval(self, first, i, j, second)
159
class Gaussian (Kernel, ckernel.Gaussian) :
    """
    A Gaussian (RBF) kernel
    K(x,y) = exp( - gamma * ||x - y||**2 )

    Construction:
    k = Gaussian(gamma)

    Attributes:
    gamma - kernel width parameter
    """

    # attribute name -> value used when the keyword is not supplied
    attributes = {'normalization' : NONE,
                  'gamma' : 1.0,}

    def __init__(self, arg = 1.0, **args) :
        """Create a Gaussian kernel.

        ``arg`` is either an instance of exactly this class, which is
        forwarded to ``ckernel.Gaussian.__init__`` (presumably a copy
        constructor), or the value of gamma.  Every name in ``attributes``
        may also be given as a keyword; a positional gamma other than 1.0
        takes precedence over a ``gamma`` keyword.
        """
        if arg.__class__ == self.__class__ :
            ckernel.Gaussian.__init__(self, arg)
            return
        ckernel.Gaussian.__init__(self)
        for name in self.attributes :
            setattr(self, name, args.get(name, self.attributes[name]))
        if arg != 1.0 :
            self.gamma = arg

    def __repr__(self) :
        """Return a two-line description showing gamma."""
        header = '<' + self.__class__.__name__ + ' instance>\n'
        return header + 'gamma : ' + str(self.gamma)

    def constructionParams(self) :
        """Return the keyword arguments that ``dump`` places in the
        constructor call."""
        return 'gamma = %s' % str(self.gamma)

    def eval (self, datai, i, j, dataj = None) :
        """Evaluate the kernel between pattern ``i`` of ``datai`` and pattern
        ``j`` of ``dataj`` (``datai`` when ``dataj`` is not given)."""
        if dataj is None :
            dataj = datai
        first = datai.castToBase()
        second = dataj.castToBase()
        return ckernel.Gaussian.eval(self, first, i, j, second)
206 207
def attachKernel(data, kernel = 'linear', **args) :
    """Create a kernel object and attach it to a dataset.

    :Parameters:
      - `data` - the dataset that receives the kernel; its ``setKernel``
        method is called, and ``computeNorms`` as well when the kernel is a
        Cosine or Gaussian kernel or uses a normalization other than NONE
      - `kernel` - one of:

        - a kernel name (case insensitive): 'linear', 'cosine',
          'polynomial' / 'poly', 'gaussian' / 'rbf'
        - a dataset (an object whose ``type`` is 'dataset'); a copy of its
          kernel is attached
        - a kernel object (``type`` is 'kernel'); a copy of it is attached

    :Keywords:
      - `normalization` - name of a normalization method, one of
        ``normalizationMethods`` (case insensitive)
      - all keywords are passed to the kernel constructor when the kernel is
        given by name (they are not used for 'cosine')

    :Raises:
      ValueError - for an unknown normalization method, an unknown kernel
      name, or a ``kernel`` argument of an unsupported kind
    """

    if 'normalization' in args :
        method = args['normalization'].lower()
        if method not in normalizationMethods :
            raise ValueError('unrecognized normalization method')
        # kernel constructors take the position of the method in the list
        args['normalization'] = normalizationMethods.index(method)

    if isinstance(kernel, str) :
        kernelName = kernel.lower()
        if kernelName == 'linear' :
            k = Linear(**args)
        elif kernelName == 'cosine' :
            k = Cosine()
        elif kernelName == 'polynomial' or kernelName == 'poly' :
            k = Polynomial(**args)
        elif kernelName == 'rbf' or kernelName == 'gaussian' :
            k = Gaussian(**args)
        else :
            raise ValueError('unrecognized type of kernel')
    elif hasattr(kernel, 'type') and kernel.type == 'dataset' :
        other = kernel
        k = other._kernel.__class__(other._kernel)
    elif hasattr(kernel, 'type') and kernel.type == 'kernel' :
        # copy through the object's own class rather than eval() on its
        # name, so kernel classes defined outside this module work as well
        k = kernel.__class__(kernel)
    else :
        # without this branch ``k`` would be unbound further down
        raise ValueError('unrecognized type of kernel')

    # hand the data object's previous kernel (if any) back to Python
    # NOTE(review): ``thisown`` is presumably the SWIG ownership flag -- confirm
    if hasattr(data, '_kernel') :
        data._kernel.thisown = True
    data._kernel = k
    data._kernel.thisown = False
    data.setKernel(k.castToBase())

    className = k.__class__.__name__
    # the class-name tests come first: Cosine does not set ``normalization``
    # in its Python constructor
    if className == 'Cosine' or className == 'Gaussian' or k.normalization != NONE :
        data.computeNorms()
242 243
def kernel2file(data, fileName, **args) :

    """compute a kernel matrix and save it to a file in tab delimited format

    :Parameters:
      - `data` - a dataset
      - `fileName` - file name to save the kernel; ``None`` or '-' writes to
        standard output

    :Keywords:
      - `format` - the format in which to save the kernel: pyml or gist formats [default: 'gist']
        gist format has an additional header line that contains the ids.

    :Raises:
      ValueError - when gist format is requested for a dataset that has no
      pattern IDs
    """
    import tempfile

    outputFormat = args.get('format', 'gist')
    patternID = data.labels.patternID
    # the gist header is made of the ids; fail before any file is touched
    if outputFormat == 'gist' and patternID is None :
        raise ValueError('gist format requires pattern IDs')

    toStdout = fileName is None or fileName == '-'
    if toStdout :
        fileName = 'stdout'

    # mkstemp creates the file itself, which avoids the race inherent in
    # mktemp; only the name is needed because ckernel writes to it by name
    handle, tmpName = tempfile.mkstemp()
    os.close(handle)
    try :
        ckernel.kernel2file(data.castToBase(), tmpName)

        if toStdout :
            outfile = sys.stdout
        else :
            outfile = open(fileName, 'w')
        try :
            if outputFormat == 'gist' :
                outfile.write(fileName + '\t')
                outfile.write('\t'.join(patternID) + '\n')
            tmp = open(tmpName)
            try :
                for i, line in enumerate(tmp) :
                    if patternID is not None :
                        # NOTE(review): the id is written with no separator;
                        # presumably each line produced by ckernel.kernel2file
                        # starts with a delimiter -- confirm
                        outfile.write(patternID[i])
                    outfile.write(line)
            finally :
                tmp.close()
        finally :
            # sys.stdout is not ours to close
            if not toStdout :
                outfile.close()
    finally :
        os.remove(tmpName)
281
def averageEntry(fileName, ignoreDiagonal = True, delim = None) :
    """compute the average of the numeric entries of a kernel file

    Every field of every line that parses as a number contributes to the
    average; all other fields (e.g. pattern ids or a header line) are
    skipped.

    :Parameters:
      - `fileName` - name of the kernel file
      - `ignoreDiagonal` - accepted for compatibility; it is currently not
        used, so diagonal entries are always part of the average
      - `delim` - field delimiter passed to ``split`` [default: None, i.e.
        any run of whitespace]

    :Returns:
      the sum of the numeric fields divided by their number

    :Raises:
      ZeroDivisionError - when the file contains no numeric field
    """

    total = 0
    numEntries = 0
    kernelFile = open(fileName)
    try :
        for line in kernelFile :
            for token in line.split(delim) :
                try :
                    val = float(token)
                except ValueError :
                    # not a number: a pattern id or a header field
                    continue
                total += val
                numEntries += 1
    finally :
        kernelFile.close()

    return total / numEntries
299 300
def PositionalKmerDispatcher(**args) :
    """
    A string kernel inspired by Raetsch et al's weighted degree kernel

    Builds a ``cstringkernel.PositionalKmer`` object from keyword arguments.

    :Keywords:
      - `mink` - [default: 6]
      - `maxk` - [default: 8]
      - `mismatches` - [default: 1]
      - `mismatchProfile` - a list that must have at least ``maxk`` entries
        when ``mismatches`` is positive
        [default: [0,0,1,1,1,1,2,2,3,3,3,3]]; it is replaced by zeros when
        ``mismatches`` is 0
      - `maxShift` - [default: 0]
      - `noShiftStart` - [default: 0]
      - `noShiftEnd` - [default: 0]

    :Raises:
      ValueError - when ``mismatchProfile`` is shorter than ``maxk`` and
      mismatches are allowed
    """

    values = {'mink' : 6,
              'maxk' : 8,
              'mismatches' : 1,
              'mismatchProfile' : [0,0,1,1,1,1,2,2,3,3,3,3],
              'maxShift' : 0,
              'noShiftStart' : 0,
              'noShiftEnd' : 0,
              }
    values.update(args)

    if len(values['mismatchProfile']) < values['maxk'] and values['mismatches'] > 0 :
        raise ValueError('mismatchProfile not long enough')

    # if no mismatches are allowed the mismatch profile needs to be all 0
    if values['mismatches'] == 0 :
        values['mismatchProfile'] = [0 for i in range(values['mink'],
                                                      values['maxk'] + 1)]

    return cstringkernel.PositionalKmer(values['mink'],
                                        values['maxk'],
                                        values['mismatches'],
                                        values['mismatchProfile'],
                                        values['maxShift'],
                                        values['noShiftStart'],
                                        values['noShiftEnd'])
332 333
def _isFloat(token) :
    """Tell whether ``token`` parses as a floating point number."""
    try :
        float(token)
    except ValueError :
        return False
    return True


def _firstDataLine(handle, delim) :
    """Read the first data line of an open kernel file; a first line whose
    last field is not a number is taken to be a gist header and skipped."""
    line = handle.readline()
    tokens = line.split(delim)
    if not tokens or not _isFloat(tokens[-1]) :
        line = handle.readline()
    return line


def _countDataRows(fileName, delim) :
    """Count the non-blank data lines (gist header excluded) of a kernel file."""
    handle = open(fileName)
    try :
        if not _firstDataLine(handle, delim).strip() :
            return 0
        return 1 + sum(1 for line in handle if line.strip())
    finally :
        handle.close()


def combineKernels(ker1file, ker2file, kerOutFile, operation = 'add', **args) :
    """combine two kernels by either adding or multiplying them.
    In the case of addition the resulting kernel is of the form:
    K_out(i,j) = weight * K1(i,j) + (1-weight) * K2(i,j)
    where the default weight is 0.5
    In the case of multiplication the resulting kernel is:
    K_out(i,j) = (const1 + K1(i,j)) * (const2 + K2(i, j))
    where const1 and const2 are 0 by default.

    Notes: It is assumed that the kernels have the same size and the ids
    are in the same order (an exception is raised if this is not satisfied).
    A gist header line in the input is skipped and is not written to the
    output.  The output uses the delimiter of the first kernel file.

    :Parameters:
      - `ker1file`, `ker2file` - file names of the two input kernels
      - `kerOutFile` - file name for the combined kernel
      - `operation` - which operation to perform between the kernels; it is
        a string with supported values 'add' or 'multiply' (add by default);
        any value other than 'add' results in multiplication

    :Keywords:
      - `weight` - weighting of kernels for kernel addition
      - `const1,const2` - additive factor in case of kernel multiplication

    :Raises:
      ValueError - when the kernels differ in size or in their ids
    """

    weight = args.get('weight', 0.5)
    const1 = args.get('const1', 0)
    const2 = args.get('const2', 0)

    delim1 = misc.getDelim(ker1file)
    delim2 = misc.getDelim(ker2file)
    numRows = _countDataRows(ker1file, delim1)

    handles = []
    try :
        ker1 = open(ker1file)
        handles.append(ker1)
        ker2 = open(ker2file)
        handles.append(ker2)
        kerOut = open(kerOutFile, 'w')
        handles.append(kerOut)

        # skip the header if the kernel is in gist format
        line1 = _firstDataLine(ker1, delim1)
        line2 = _firstDataLine(ker2, delim2)

        # check if there's a pattern id.  A square matrix whose rows carry
        # an id has one more field per row than it has rows; using the shape
        # keeps numeric ids from being mistaken for kernel values.  The
        # look of the first field is only consulted when the shape is
        # inconclusive.
        tokens1 = line1.split(delim1)
        if len(tokens1) == numRows + 1 :
            firstToken = 1
        elif len(tokens1) == numRows :
            firstToken = 0
        elif tokens1 and _isFloat(tokens1[0]) :
            firstToken = 0
        else :
            firstToken = 1

        while len(line1) > 0 :
            if len(line2) == 0 :
                raise ValueError('kernels do not have the same size')
            tokens1 = line1.split(delim1)
            tokens2 = line2.split(delim2)
            if len(tokens1) != len(tokens2) :
                raise ValueError('kernels do not have the same size')
            if firstToken > 0 :
                if tokens1[0] != tokens2[0] :
                    raise ValueError('kernels do not have the same ids: %s, %s'
                                     % (tokens1[0], tokens2[0]))
                # NOTE(review): this and the join below need delim1 to be a
                # string; confirm misc.getDelim never returns None
                kerOut.write(tokens1[0] + delim1)
            if operation == 'add' :
                outTokens = [str(float(tokens1[i]) * weight +
                                 float(tokens2[i]) * (1-weight))
                             for i in range(firstToken, len(tokens1))]
            else :
                outTokens = [str((const1 + float(tokens1[i])) *
                                 (const2 + float(tokens2[i])))
                             for i in range(firstToken, len(tokens1))]
            kerOut.write(delim1.join(outTokens) + '\n')
            line1 = ker1.readline()
            line2 = ker2.readline()

        # rows left over in the second kernel mean the sizes differ
        if line2.strip() :
            raise ValueError('kernels do not have the same size')
    finally :
        for handle in handles :
            handle.close()
409
def sortKernel(kernelInFile, kernelOutFile, format = 'gist', **args) :
    """
    sort a kernel matrix according to its pattern ID

    :Parameters:
      - `kernelInFile` - the kernel input file name
      - `kernelOutFile` - the output file name
      - `format` - whether to output the kernel in gist format; with 'gist'
        a header line made of the output file name and the sorted ids is
        written first

    :Keywords:
      - `delim` - the field delimiter (default = tab)
    """

    from PyML.containers import KernelData
    kdata = KernelData(kernelInFile)
    # pattern id -> position of the pattern in the input kernel
    idDict = misc.list2dict(kdata.labels.patternID, list(range(len(kdata))))
    ids = sorted(kdata.labels.patternID)
    delim = args.get('delim', '\t')

    kernelFile = open(kernelOutFile, 'w')
    try :
        if format == 'gist' :
            kernelFile.write(kernelOutFile + delim + delim.join(ids) + '\n')

        for id1 in ids :
            row = idDict[id1]
            tokens = [str(kdata.kernel.eval(kdata, row, idDict[id2]))
                      for id2 in ids]
            kernelFile.write(id1 + delim + delim.join(tokens) + '\n')
    finally :
        # close explicitly so the output is flushed even on error
        kernelFile.close()
440 441
def _writeKernelSubset(kdata, ids, wanted, outFileName, delim) :
    """Write the sub-matrix of ``kdata`` over ``ids`` (in that order) to
    ``outFileName``, one row per id with the id as the first field."""
    # pattern id -> position in kdata, restricted to the wanted ids
    idDict = {}
    for i in range(len(kdata)) :
        patternID = kdata.labels.patternID[i]
        if patternID in wanted :
            idDict[patternID] = i

    outFile = open(outFileName, 'w')
    try :
        for id1 in ids :
            print(id1)
            tokens = [str(kdata.kernel.eval(kdata, idDict[id1], idDict[id2]))
                      for id2 in ids]
            outFile.write(id1 + delim + delim.join(tokens) + '\n')
    finally :
        outFile.close()


def commonKernel(kernelFile1, kernelFile2, kernelOutFileName1, kernelOutFileName2) :
    """
    restrict two kernels to the patterns they have in common

    The ids present in both kernels are sorted, and each kernel is written,
    restricted to those ids and in that order, to its output file.  The
    output is space delimited and has no header line.  A kernel whose number
    of patterns already equals the number of common ids is not written.
    Progress (each id as its row is written) is printed to standard output.

    :Parameters:
      - `kernelFile1`, `kernelFile2` - the kernel input file names
      - `kernelOutFileName1`, `kernelOutFileName2` - the matching output
        file names
    """

    delim = ' '
    # same import path as sortKernel; ``misc`` is the module-level import
    from PyML.containers import KernelData
    kdata1 = KernelData(kernelFile1)
    kdata2 = KernelData(kernelFile2)
    print('loaded data')
    ids = misc.intersect(kdata1.labels.patternID, kdata2.labels.patternID)
    ids.sort()
    # used for membership tests only
    idDict1 = misc.list2dict(ids)

    if len(ids) != len(kdata1) :
        _writeKernelSubset(kdata1, ids, idDict1, kernelOutFileName1, delim)

    if len(ids) != len(kdata2) :
        _writeKernelSubset(kdata2, ids, idDict1, kernelOutFileName2, delim)
479 480
def expandKernel(inKernelFile, referenceKernelFile, outKernelFile, **args) :

    """
    Given a kernel matrix that might have missing entries, fill those as 0
    on the basis of the patterns in a reference kernel (it is checked that
    the reference kernel is sorted).

    :Parameters:
      - `inKernelFile` - input kernel file name
      - `referenceKernelFile` - file name for the reference kernel
      - `outKernelFile` - file name to output expanded kernel

    :Keywords:
      - `format` - 'gist' writes a header line made of the output file name
        and the ids [default: 'gist']

    :Raises:
      ValueError - when the ids of the reference kernel are not sorted
    """

    import numpy
    # same import path as sortKernel; ``misc`` is the module-level import
    from PyML.containers import KernelData

    outputFormat = args.get('format', 'gist')
    delim = '\t'

    inKernel = KernelData(inKernelFile)
    refKernel = KernelData(referenceKernelFile)
    print('loaded data')
    ids = sorted(refKernel.labels.patternID)
    if ids != refKernel.labels.patternID :
        raise ValueError('reference kernel not sorted')

    # pattern id -> position of the pattern in the input kernel (built the
    # way sortKernel builds its index map)
    idDict = misc.list2dict(inKernel.labels.patternID,
                            list(range(len(inKernel))))
    numPatterns = len(refKernel)

    outKernel = open(outKernelFile, 'w')
    try :
        if outputFormat == 'gist' :
            outKernel.write(outKernelFile + delim)
            outKernel.write(delim.join(ids) + '\n')

        for i in range(numPatterns) :
            outKernel.write(ids[i] + delim)
            # one fresh row of zeros per pattern; entries stay 0 unless both
            # patterns are present in the input kernel
            values = numpy.zeros(numPatterns, float)
            if ids[i] in idDict :
                for j in range(numPatterns) :
                    if ids[j] in idDict :
                        values[j] = inKernel.kernel.eval(inKernel,
                                                         idDict[ids[i]],
                                                         idDict[ids[j]])
            tokens = [str(value) for value in values]
            outKernel.write(delim.join(tokens) + '\n')
    finally :
        outKernel.close()
527
def showKernel(dataOrMatrix, fileName = None, useLabels = True, **args) :
    """
    plot a kernel matrix using matplotlib

    The figure is saved and closed when a file name is given; no call that
    displays the figure is made by this function.

    :Parameters:
      - `dataOrMatrix` - a dataset (an object whose ``type`` is 'dataset'),
        whose kernel matrix and labels are used, or a kernel matrix
      - `fileName` - file to save the figure to; the PS backend is selected
        when the name contains '.eps' [default: None]
      - `useLabels` - whether to draw a horizontal and a vertical line at
        each cumulative class size; has no effect when no labels are
        available [default: True]

    :Keywords:
      - `labels` - labels object to use when a matrix is given
    """

    labels = None
    if hasattr(dataOrMatrix, 'type') and dataOrMatrix.type == 'dataset' :
        data = dataOrMatrix
        k = data.getKernelMatrix()
        labels = data.labels
    else :
        k = dataOrMatrix
        if 'labels' in args :
            labels = args['labels']

    import matplotlib

    # the backend has to be chosen before pylab is imported
    if fileName is not None and fileName.find('.eps') > 0 :
        matplotlib.use('PS')
    from matplotlib import pylab

    pylab.matshow(k)

    # a plain matrix given without the labels keyword leaves labels as None
    if useLabels and labels is not None and labels.L is not None :
        numPatterns = 0
        for i in range(labels.numClasses) :
            numPatterns += labels.classSize[i]
            pylab.axhline(numPatterns, color = 'black', linewidth = 1)
            pylab.axvline(numPatterns, color = 'black', linewidth = 1)
        pylab.axis([0, len(labels), 0, len(labels)])

    if fileName is not None :
        pylab.savefig(fileName)
        pylab.close()
561 562
def sortKernel2(kernelInFile, kernelOutFile, ids, format = 'gist', **args) :
    """
    sort a kernel matrix according to the given list of ids

    :Parameters:
      - `kernelInFile` - the kernel input file name
      - `kernelOutFile` - the output file name
      - `ids` - the pattern ids in the order in which they should appear in
        the output; each must be a pattern id of the input kernel
      - `format` - whether to output the kernel in gist format; with 'gist'
        a header line made of the output file name and the ids is written
        first

    :Keywords:
      - `delim` - the field delimiter (default = tab)

    :Raises:
      KeyError - when an id is not a pattern id of the input kernel
    """

    from PyML.containers import KernelData
    kdata = KernelData(kernelInFile)
    K = kdata.getKernelMatrix()
    # pattern id -> position in the INPUT kernel.  Indexing K by positions
    # in ``ids`` would copy the matrix unpermuted under reordered labels.
    idDict = misc.list2dict(kdata.labels.patternID, list(range(len(kdata))))

    delim = args.get('delim', '\t')

    kernelFile = open(kernelOutFile, 'w')
    try :
        if format == 'gist' :
            kernelFile.write(kernelOutFile + delim + delim.join(ids) + '\n')

        for id1 in ids :
            row = K[idDict[id1]]
            tokens = [str(row[idDict[id2]]) for id2 in ids]
            kernelFile.write(id1 + delim + delim.join(tokens) + '\n')
    finally :
        # close explicitly so the output is flushed even on error
        kernelFile.close()
592