Package Bio :: Package MEME :: Module Motif
[hide private]
[frames] | no frames]

Source Code for Module Bio.MEME.Motif

  1  # Copyright 2004 by Jason A. Hackney.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  from Bio import Seq 
  7  from Bio.Alphabet import IUPAC 
  8  from math import sqrt 
  9  import sys 
 10   
11 -class Motif:
12 """A generic motif class. 13 14 A Motif has information about the alphabet used in the motif, 15 the length of the motif, the motif name, the number of occurrences, 16 and the individual instances of the motif. Each of the instances 17 is an object of the Instance class. 18 19 Methods: 20 add_instance(instance): adds a new instance of the motif. 21 get_instance_by_name(name): returns an instance with the specified name. 22 """
23 - def __init__(self):
24 self.instances = [] 25 self.alphabet = None 26 self.length = 0 27 self.num_occurrences = 0 28 self.name = "" 29 self.consensus = "" 30 self.pssm = []
31
32 - def add_instance (self, instance):
33 if isinstance(instance, Instance): 34 if self.length: 35 instance._length(self.length) 36 if self.name: 37 instance._motifname(self.name) 38 self.instances.append(instance)
39
40 - def get_instance_by_name (self,name):
41 for i in set.instances: 42 if i.seqname == name: 43 return i 44 return None
45
46 - def _alphabet (self, alphabet):
47 if alphabet == IUPAC.unambiguous_dna or alphabet == IUPAC.protein or alphabet == IUPAC.ambiguous_dna: 48 self.alphabet = alphabet 49 else: 50 return -1
51
52 - def _length (self, length):
53 if type(length) == int: 54 self.length = length 55 else: 56 length = int(length) 57 self.length = length
58
59 - def _name (self, name):
60 self.name = name
61
62 - def _consensus (self, consensus):
63 if self.alphabet: 64 self.consensus = Seq.Seq(consensus, self.alphabet) 65 else: 66 self.consensus = consensus
67
68 - def _numoccurrences (self, number):
69 if type(number) == int: 70 self.num_occurrences = number 71 else: 72 number = int(number) 73 self.num_occurrences = number
74
75 - def make_pssm (self):
76 if self.alphabet == None: 77 raise ValueError, "Alphabet for motif has not been set" 78 moieties = '' 79 if self.alphabet == IUPAC.unambiguous_dna: 80 moieties = 'ACGT' 81 if self.alphabet == IUPAC.protein: 82 moieties = 'ACDEFGHIKLMNPQRSTVWY' 83 pssm = [] 84 if self.instances and self.instances[0].sequence: 85 for position in self.instances[0].sequence: 86 pos = [] 87 for m in moieties: 88 pos.append(0.0) 89 pssm.append(pos) 90 for instance in self.instances: 91 for position in range(self.length): 92 my_moiety = instance.sequence[position] 93 try: 94 moiety_index = moieties.index(my_moiety) 95 except ValueError: 96 moiety_index = 0 97 pssm[position][moiety_index] += 1.0 98 pssm = [[x/len(self.instances) for x in y] for y in pssm ] 99 pssm = [tuple(x) for x in pssm] 100 self.pssm = pssm 101 else: 102 self.pssm = None
103
104 - def make_consensus (self, minimum_frequency = 0.6):
105 if not self.pssm: 106 self.make_pssm() 107 consensus = '' 108 null_character = 'N' 109 moieties = 'ACGT' 110 if self.alphabet == IUPAC.protein: 111 null_character = 'X' 112 moieties = 'ACDEFGHIKLMNPQRSTVWY' 113 for position in self.pssm: 114 this_position = null_character 115 vals = zip(position,moieties) 116 good_values = filter(lambda x: x[0] >= minimum_frequency, vals) 117 if good_values: 118 letters = [str(x[1]) for x in good_values] 119 my_letter = '/'.join(letters) 120 else: 121 my_letter = null_character 122 consensus += my_letter 123 self.consensus = consensus
124
125 -class MEMEMotif (Motif):
126 """A subclass of Motif used in parsing MEME (and MAST) output. 127 128 This sublcass defines functions and data specific to MEME motifs. 129 This includes the evalue for a motif and the PSSM of the motif. 130 131 Methods: 132 add_instance_from_values (name = 'default', pvalue = 1, sequence = 'ATA', start = 0, strand = +): create a new instance of the motif with the specified values. 133 add_to_pssm (position): add a new position to the pssm. The position should be a list of nucleotide/amino acid frequencies 134 add_to_logodds (position): add a new position to the log odds matrix. The position should be a tuple of log odds values for the nucleotide/amino acid at that position. 135 compare_motifs (other_motif): returns the maximum correlation between this motif and other_motif 136 """
137 - def __init__ (self):
138 Motif.__init__(self) 139 self.evalue = 0.0 140 self.pssm = [] 141 self.logodds = []
142
143 - def add_instance_from_values (self, name = 'default', pvalue = 1, sequence = 'ATA', start = 0, strand = '+'):
144 inst = Instance() 145 inst._sequence(sequence) 146 inst._pvalue(pvalue) 147 inst._seqname(name) 148 inst._start(start) 149 inst._strand(strand) 150 if self.length: 151 inst._length(self.length) 152 if self.name: 153 inst._motifname(self.name) 154 self.instances.append(inst)
155
156 - def _evalue (self, evalue):
157 if type(evalue) == float: 158 self.evalue = evalue 159 else: 160 evalue = float(evalue) 161 self.evalue = evalue
162
163 - def add_to_pssm (self, thisposition):
164 self.pssm.append(thisposition)
165
166 - def add_to_logodds (self, thisposition):
167 self.logodds.append(thisposition)
168
169 - def compare_motifs (self, motif):
170 if isinstance(motif, MEMEMotif): 171 if not self.pssm: 172 raise ValueError, 'This motif does not have a PSSM' 173 if not motif.pssm: 174 raise ValueError, 'The other motif does not have a PSSM' 175 mylen = len(self.pssm) 176 yourlen = len(motif.pssm) 177 myr = None 178 if mylen == yourlen: 179 myr = _corr(self.pssm, motif.pssm) 180 elif mylen < yourlen: 181 diff = yourlen - mylen 182 for i in range(0,diff): 183 yourpssm = motif.pssm[i:mylen+i] 184 r = _corr(self.pssm, yourpssm) 185 if r > myr: 186 myr = r 187 else: 188 diff = mylen - yourlen 189 for i in range(0, diff): 190 mypssm = self.pssm[i:yourlen+i] 191 r = _corr(mypssm, motif.pssm) 192 if r > myr: 193 myr = r 194 return myr 195 else: 196 sys.stderr.write(str(m2)) 197 return None
198 199 200
201 -class Instance:
202 """A class describing the instances of a motif, and the data thereof. 203 """
204 - def __init__ (self):
205 self.sequence = None 206 self.sequence_name = "" 207 self.start = 0 208 self.pvalue = 1.0 209 self.strand = 0 210 self.length = 0 211 self.motif_name = ""
212
213 - def _sequence (self, seq):
214 self.sequence = seq
215
216 - def _seqname (self, name):
217 self.sequence_name = name
218
219 - def _motifname (self, name):
220 self.motif_name = name
221
222 - def _start (self,start):
223 start = int(start) 224 self.start = start
225
226 - def _pvalue (self,pval):
227 pval = float(pval) 228 self.pvalue = pval
229
230 - def _score (self, score):
231 score = float(score) 232 self.score = score
233
234 - def _strand (self, strand):
235 self.strand = strand
236
237 - def _length (self, length):
238 self.length = length
239 240
241 -def _corr (x,y):
242 sx = 0 243 sy = 0 244 sxx = 0 245 syy = 0 246 sxy = 0 247 #make the two-dimensional lists one dimensional 248 if type(x[0] == list): 249 x = reduce(lambda a,b: a+b, x) 250 if type(y[0] == list): 251 y = reduce(lambda a,b: a+b, y) 252 sq = lambda t: t*t 253 length = len(x) 254 for a,b in zip(x,y): 255 sx += a 256 sy += b 257 sxx += sq(a) 258 syy += sq(b) 259 sxy += a*b 260 s1 = sxy - sx * sy * 1.0/length 261 s2 = (sxx - sx * sy * 1.0/length) * (syy - sx * sy * 1.0/length) 262 r = s1 / sqrt (s2) 263 return r
264