Package PyML :: Package containers :: Module sequenceData
[frames] | no frames]

Source Code for Module PyML.containers.sequenceData

  1   
  2  from PyML.containers.labels import Labels 
  3  from PyML.containers.ext import csequencedata 
  4  from PyML.containers.baseDatasets import WrapperDataSet 
  5  from PyML.containers import ker 
  6  from PyML.containers.vectorDatasets import SparseDataSet 
  7   
class SequenceData(WrapperDataSet, csequencedata.SequenceData):
    """A dataset of sequences wrapping the ``csequencedata.SequenceData`` container.

    An instance can be built in three ways, selected by the type of ``arg``:

    * another ``SequenceData`` of exactly the same class -- copy construction;
    * a string -- interpreted as the name of a FASTA file to read;
    * a list or ``numpy.ndarray`` of sequences.

    Recognized keyword arguments (all optional):

    * ``kernel`` -- passed to ``attachKernel``; ``'linear'`` is used otherwise.
    * ``labelsFile`` -- labels are loaded from this file and attached.
    * ``headerHandler`` -- used by ``constructFromFile`` to parse FASTA headers.
    * ``stringKernel`` -- see ``attachStringKernel``.
    """

    def __init__(self, arg=None, **args):
        """Build the dataset from ``arg``; see the class docstring.

        Raises:
            ValueError: if ``arg`` is not a same-class instance, a string,
                a list or a ``numpy.ndarray``.
        """
        self.container = csequencedata.SequenceData
        WrapperDataSet.__init__(self)

        # copy construction:
        if arg.__class__ == self.__class__:
            self.copyConstruct(arg, **args)
            self.attachStringKernel(arg.stringKernel, **args)
            return

        if isinstance(arg, str):
            # construct from a file:
            self.constructFromFile(arg, **args)
        else:
            # numpy is imported here because the module has no file-level
            # numpy import; the original referenced ``numpy.ndarray``
            # without one and raised NameError for non-list arguments.
            import numpy
            if isinstance(arg, (list, numpy.ndarray)):
                # construct from list of strings:
                self.fromArray(arg, **args)
            else:
                raise ValueError('wrong type of arg')

        self.attachStringKernel(**args)

        if 'kernel' in args:
            # Named ``kernel`` rather than ``ker`` so the imported ``ker``
            # module is not shadowed inside this method.
            kernel = args.pop('kernel')
            self.attachKernel(kernel, **args)
        else:
            self.attachKernel('linear')

        if 'labelsFile' in args:
            self.attachLabels(Labels(args['labelsFile'], **args))

    def copy(self, other, patterns, deepcopy):
        """Initialize the container from the selected ``patterns`` of ``other``.

        ``patterns`` is a collection of indices into ``other``; ``None`` means
        all of them.  ``deepcopy`` is accepted but not used by this method.
        """
        if patterns is None:
            patterns = list(range(len(other)))
        self.container.__init__(self, other, patterns)

    def constructFromFile(self, fileName, **args):
        """Read sequences, pattern IDs and (optional) labels from a FASTA file.

        ``args['headerHandler']``, when given, is called on each record header
        and must return a ``(patternID, label)`` pair; ``fastaHeaderHandler``
        is the default.  Labels that are ``None`` are not collected.
        """
        print('reading from %s' % (fileName,))
        from PyML.utils import fasta

        headerHandler = args.get('headerHandler', fastaHeaderHandler)
        numPatterns = fasta.fasta_count(fileName)
        self.container.__init__(self, numPatterns)

        patternIDs = []
        L = []
        for record in fasta.fasta_itr(fileName):
            self.addPattern(record.sequence)
            patternID, label = headerHandler(record.header)
            patternIDs.append(patternID)
            if label is not None:
                L.append(label)

        self.attachLabels(Labels(L, patternID=patternIDs, **args))

    def fromArray(self, X, **args):
        """Initialize the container with ``len(X)`` slots and add each element of ``X``."""
        self.container.__init__(self, len(X))
        for x in X:
            self.addPattern(x)

    def __len__(self):
        """Return the number of patterns as reported by ``self.size()``."""
        return self.size()

    def save(self, fileName):
        """Write the dataset to ``fileName`` in FASTA format.

        Each record is a ``>patternID`` header line followed by the sequence
        on a single line.
        """
        fileHandle = open(fileName, 'w')
        try:
            for seqid in range(len(self)):
                fileHandle.write('>' + self.labels.patternID[seqid] + '\n')
                fileHandle.write(self.getSequence(seqid) + '\n')
        finally:
            # Always release the handle, even if a write fails part-way.
            fileHandle.close()

    def attachStringKernel(self, stringKernel=None, **args):
        """Attach a string kernel to the dataset as ``self.stringKernel``.

        ``stringKernel`` may be:

        * ``None`` -- a ``'PositionalKmer'`` kernel is created from ``args``;
        * a string naming the kernel type (only ``'PositionalKmer'`` is
          recognized);
        * a kernel object -- its ``duplicate()`` is attached.

        Raises:
            ValueError: if a kernel type name is given that is not recognized.
        """
        kerneltype = 'PositionalKmer'
        # A ``stringKernel='...'`` keyword always binds to the named parameter
        # and never lands in ``args``, so a type name has to be detected here.
        if isinstance(stringKernel, str):
            kerneltype = stringKernel
            stringKernel = None

        if stringKernel is not None:
            k = stringKernel.duplicate()
        elif kerneltype == 'PositionalKmer':
            k = ker.PositionalKmerDispatcher(**args)
        else:
            raise ValueError('unrecognized type of string kernel')

        self.stringKernel = k

def fastaHeaderHandler(header):
    """Default FASTA header parser.

    Returns a ``(patternID, label)`` pair in which the pattern ID is the first
    whitespace-delimited token of ``header`` and the label is always ``None``.
    """
    pattern_id = header.split(None, 1)[0]
    return pattern_id, None

def generateSpectrum(sequences, k, addon=''):
    """Count the k-mers of every sequence.

    For each sequence in ``sequences`` a dictionary is produced that maps each
    length-``k`` substring (with ``addon`` appended to it) to the number of
    times it occurs, stored as a float.  Sequences shorter than ``k`` yield an
    empty dictionary.

    Returns the list of dictionaries, one per input sequence, in input order.
    """
    spectra = []
    for seq in sequences:
        counts = {}
        for start in range(len(seq) - k + 1):
            token = seq[start:start + k] + addon
            counts[token] = counts.get(token, 0) + 1.0
        spectra.append(counts)

    return spectra

def spectrumData(sequences, k1, k2=None, **args):
    """Build a ``SparseDataSet`` of k-mer counts for ``sequences``.

    A spectrum is generated for every ``k`` in ``range(k1, k2)``; when ``k2``
    is ``None`` only ``k1`` is used.  The per-``k`` datasets are joined with
    ``addFeatures`` onto the ``k1`` dataset, which is returned.

    Keyword arguments:
        addon: string appended to every k-mer (default ``''``).
        normalize: when true (the default), ``normalize(2)`` is called on each
            per-``k`` dataset before it is joined.
    """
    addon = args.get('addon', '')
    normalize = args.get('normalize', True)
    if k2 is None:
        k2 = k1 + 1

    def buildSpectrum(k):
        # One dataset per k-mer length, normalized on its own when requested.
        spectrum = SparseDataSet(generateSpectrum(sequences, k, addon))
        if normalize:
            spectrum.normalize(2)
        return spectrum

    data = buildSpectrum(k1)
    for k in range(k1 + 1, k2):
        data.addFeatures(buildSpectrum(k))

    return data
