1
2
3
4
5 """
6 Implementation of sequence motifs.
7
8 Changes:
9 10.2007 - BW added matrix (vertical, horizontal) input, jaspar, transfac-like output
10 26.08.2007 - added a background attribute (Bartek Wilczynski)
11 26.08.2007 - added a DPQ measure (Bartek Wilczynski)
12 9.2007 (BW) : added the .to_fasta() and .weblogo() methods allowing to use the Berkeley weblogo server at http://weblogo.berkeley.edu/
13 """
14 from Bio.Seq import Seq
15 from Bio.SubsMat import FreqTable
16 from Bio.Alphabet import IUPAC
17 import math,random
18
20 """
21 A class representing sequence motifs.
22 """
24 self.instances = []
25 self.has_instances=False
26 self.counts = {}
27 self.has_counts=False
28 self.mask = []
29 self._pwm_is_current = False
30 self._pwm = []
31 self._log_odds_is_current = False
32 self._log_odds = []
33 self.alphabet=alphabet
34 self.length=None
35 self.background=dict(map(lambda n: (n,1.0/len(self.alphabet.letters)), self.alphabet.letters))
36 self.beta=1.0
37 self.info=None
38 self.name=""
39
41 if self.length==None:
42 self.length = len
43 elif self.length != len:
44 print "len",self.length,self.instances
45 raise ValueError("You can't change the length of the motif")
46
52
70
71
73 """
74 sets the mask for the motif
75
76 The mask should be a string containing asterisks in the position of significant columns and spaces in other columns
77 """
78 self._check_length(len(mask))
79 self.mask=[]
80 for char in mask:
81 if char=="*":
82 self.mask.append(1)
83 elif char==" ":
84 self.mask.append(0)
85 else:
86 raise ValueError("Mask should contain only '*' or ' ' and not a '%s'"%char)
87
89 """
90 returns the PWM computed for the set of instances
91
92 if laplace=True (default), pseudocounts equal to self.background multiplied by self.beta are added to all positions.
93 """
94
95 if self._pwm_is_current:
96 return self._pwm
97
98 self._pwm = []
99 for i in xrange(self.length):
100 dict = {}
101
102 for letter in self.alphabet.letters:
103 if laplace:
104 dict[letter]=self.beta*self.background[letter]
105 else:
106 dict[letter]=0.0
107 if self.has_counts:
108
109 for letter in self.alphabet.letters:
110 dict[letter]+=self.counts[letter][i]
111 elif self.has_instances:
112
113 for seq in self.instances:
114
115 dict[seq[i]]+=1
116 self._pwm.append(FreqTable.FreqTable(dict,FreqTable.COUNT,self.alphabet))
117 self._pwm_is_current=1
118 return self._pwm
119
121 """
122 returns the logg odds matrix computed for the set of instances
123 """
124
125 if self._log_odds_is_current:
126 return self._log_odds
127
128 self._log_odds = []
129 pwm=self.pwm(laplace)
130 for i in xrange(self.length):
131 d = {}
132 for a in self.alphabet.letters:
133 d[a]=math.log(pwm[i][a]/self.background[a],2)
134 self._log_odds.append(d)
135 self._log_odds_is_current=1
136 return self._log_odds
137
139 """Method returning the information content of a motif.
140 """
141 res=0
142 pwm=self.pwm()
143 for i in range(self.length):
144 res+=2
145 for a in self.alphabet.letters:
146 if pwm[i][a]!=0:
147 res+=pwm[i][a]*math.log(pwm[i][a],2)
148 return res
149
151 """
152 Computes expected score of motif's instance and its standard deviation
153 """
154 exs=0.0
155 var=0.0
156 pwm=self.pwm()
157 for i in range(self.length):
158 ex1=0.0
159 ex2=0.0
160 for a in self.alphabet.letters:
161 if pwm[i][a]!=0:
162 ex1+=pwm[i][a]*(math.log(pwm[i][a],2)-math.log(self.background[a],2))
163 ex2+=pwm[i][a]*(math.log(pwm[i][a],2)-math.log(self.background[a],2))**2
164 exs+=ex1
165 var+=ex2-ex1**2
166 if st_dev:
167 return exs,math.sqrt(var)
168 else:
169 return exs
170
172 """
173 a generator function, returning found positions of instances of the motif in a given sequence
174 """
175 if not self.has_instances:
176 raise ValueError ("This motif has no instances")
177 for pos in xrange(0,len(sequence)-self.length+1):
178 for instance in self.instances:
179 if instance.tostring()==sequence[pos:pos+self.length].tostring():
180 yield(pos,instance)
181 break
182
183 - def score_hit(self,sequence,position,normalized=0,masked=0):
184 """
185 give the pwm score for a given position
186 """
187 lo=self.log_odds()
188 score = 0.0
189 for pos in xrange(self.length):
190 a = sequence[position+pos]
191 if not masked or self.mask[pos]:
192 try:
193 score += lo[pos][a]
194 except:
195 pass
196 if normalized:
197 if not masked:
198 score/=self.length
199 else:
200 score/=len(filter(lambda x: x, self.mask))
201 return score
202
203 - def search_pwm(self,sequence,normalized=0,masked=0,threshold=0.0,both=True):
204 """
205 a generator function, returning found hits in a given sequence with the pwm score higher than the threshold
206 """
207 if both:
208 rc = self.reverse_complement()
209
210 sequence=sequence.tostring().upper()
211 for pos in xrange(0,len(sequence)-self.length+1):
212 score = self.score_hit(sequence,pos,normalized,masked)
213 if score > threshold:
214 yield (pos,score)
215 if both:
216 rev_score = rc.score_hit(sequence,pos,normalized,masked)
217 if rev_score > threshold:
218 yield (-pos,rev_score)
219
221 """
222 return the similarity score based on pearson correlation for the given motif against self.
223
224 We use the Pearson's correlation of the respective probabilities.
225 """
226
227 if self.alphabet != motif.alphabet:
228 raise ValueError("Cannot compare motifs with different alphabets")
229
230 max_p=-2
231 for offset in range(-self.length+1,motif.length):
232 if offset<0:
233 p = self.dist_pearson_at(motif,-offset)
234 else:
235 p = motif.dist_pearson_at(self,offset)
236
237 if max_p<p:
238 max_p=p
239 max_o=-offset
240 return 1-max_p,max_o
241
264
266 """
267 A similarity measure taking into account a product probability of generating overlaping instances of two motifs
268 """
269 max_p=0.0
270 for offset in range(-self.length+1,other.length):
271 if offset<0:
272 p = self.dist_product_at(other,-offset)
273 else:
274 p = other.dist_product_at(self,offset)
275 if max_p<p:
276 max_p=p
277 max_o=-offset
278 return 1-max_p/self.dist_product_at(self,0),max_o
279
281 s=0
282 for i in range(max(self.length,offset+other.length)):
283 f1=self[i]
284 f2=other[i-offset]
285 for n,b in self.background.items():
286 s+=b*f1[n]*f2[n]
287 return s/i
288
290 r"""Calculates the DPQ distance measure between motifs.
291
292 It is calculated as a maximal value of DPQ formula (shown using LaTeX
293 markup, familiar to mathematicians):
294
295 \sqrt{\sum_{i=1}^{alignment.len()} \sum_{k=1}^alphabet.len() \
296 \{ m1[i].freq(alphabet[k])*log_2(m1[i].freq(alphabet[k])/m2[i].freq(alphabet[k])) +
297 m2[i].freq(alphabet[k])*log_2(m2[i].freq(alphabet[k])/m1[i].freq(alphabet[k]))
298 }
299
300 over possible non-spaced alignemts of two motifs. See this reference:
301
302 D. M Endres and J. E Schindelin, "A new metric for probability
303 distributions", IEEE transactions on Information Theory 49, no. 7
304 (July 2003): 1858-1860.
305 """
306
307 min_d=float("inf")
308 min_o=-1
309 d_s=[]
310 for offset in range(-self.length+1,other.length):
311
312 if offset<0:
313 d = self.dist_dpq_at(other,-offset)
314 overlap = self.length+offset
315 else:
316 d = other.dist_dpq_at(self,offset)
317 overlap = other.length-offset
318 overlap = min(self.length,other.length,overlap)
319 out = self.length+other.length-2*overlap
320
321
322 d = (d/(out+overlap))*(2*overlap+out)/(2*overlap)
323
324 d_s.append((offset,d))
325 if min_d> d:
326 min_d=d
327 min_o=-offset
328 return min_d,min_o
329
331 """
332 calculates the dist_dpq measure with a given offset.
333
334 offset should satisfy 0<=offset<=self.length
335 """
336 def dpq (f1,f2,alpha):
337 s=0
338 for n in alpha.letters:
339 avg=(f1[n]+f2[n])/2
340 s+=f1[n]*math.log(f1[n]/avg,2)+f2[n]*math.log(f2[n]/avg,2)
341 return math.sqrt(s)
342
343 s=0
344 for i in range(max(self.length,offset+other.length)):
345 f1=self[i]
346 f2=other[i-offset]
347 s+=dpq(f1,f2,self.alphabet)
348 return s
349
351 """Reads the motif from the stream (in AlignAce format).
352
353 the self.alphabet variable must be set beforehand.
354 If the last line contains asterisks it is used for setting mask
355 """
356
357 while 1:
358 ln = stream.readline()
359 if "*" in ln:
360 self.set_mask(ln.strip("\n\c"))
361 break
362 self.add_instance(Seq(ln.strip(),self.alphabet))
363
365 """ string representation of a motif.
366 """
367 str = ""
368 for inst in self.instances:
369 str = str + inst.tostring() + "\n"
370
371 if masked:
372 for i in xrange(self.length):
373 if self.mask[i]:
374 str = str + "*"
375 else:
376 str = str + " "
377 str = str + "\n"
378 return str
379
381 """return the length of a motif
382 """
383 if self.length==None:
384 return 0
385 else:
386 return self.length
387
389 """
390 writes the motif to the stream
391 """
392
393 stream.write(self.__str__())
394
395
396
398 """
399 FASTA representation of motif
400 """
401 if not self.has_instances:
402 self.make_instances_from_counts()
403 str = ""
404 for i,inst in enumerate(self.instances):
405 str = str + "> instance %d\n"%i + inst.tostring() + "\n"
406
407 return str
408
410 """
411 Gives the reverse complement of the motif
412 """
413 res = Motif()
414 if self.has_instances:
415 for i in self.instances:
416 res.add_instance(i.reverse_complement())
417 else:
418 res.has_counts=True
419 res.counts["A"]=self.counts["T"]
420 res.counts["T"]=self.counts["A"]
421 res.counts["G"]=self.counts["C"]
422 res.counts["C"]=self.counts["G"]
423 res.counts["A"].reverse()
424 res.counts["C"].reverse()
425 res.counts["G"].reverse()
426 res.counts["T"].reverse()
427 res.length=self.length
428 res.mask = self.mask
429 return res
430
431
433 """
434 reads the motif from Jaspar .pfm file
435
436 The instances are fake, but the pwm is accurate.
437 """
438 return self._from_horiz_matrix(stream,letters="ACGT",make_instances=make_instances)
439
460
489
490
492 """Creates "fake" instances for a motif created from a count matrix.
493
494 In case the sums of counts are different for different columnes, the shorter columns are padded with background.
495 """
496 alpha="".join(self.alphabet.letters)
497
498 col=[]
499 self.has_instances=True
500 self.instances=[]
501 s = sum(map(lambda nuc: self.counts[nuc][0],self.alphabet.letters))
502 for i in range(self.length):
503 col.append("")
504 for n in self.alphabet.letters:
505 col[i] = col[i]+ (n*(self.counts[n][i]))
506 if len(col[i])<s:
507 print "WARNING, column too short",len(col[i]),s
508 col[i]+=(alpha*s)[:(s-len(col[i]))]
509
510
511 for i in range(s):
512 inst=""
513 for j in range(self.length):
514 inst+=col[j][i]
515
516 inst=Seq(inst,self.alphabet)
517 self.add_instance(inst)
518 return self.instances
519
521 """Creates the count matrix for a motif with instances.
522
523 """
524
525
526 counts={}
527 for a in self.alphabet.letters:
528 counts[a]=[]
529 self.has_counts=True
530 s = len(self.instances)
531 for i in range(self.length):
532 ci = dict(map(lambda a: (a,0),self.alphabet.letters))
533 for inst in self.instances:
534 ci[inst[i]]+=1
535 for a in self.alphabet.letters:
536 counts[a].append(ci[a])
537 self.counts=counts
538 return counts
539
541 """
542 reads the motif from Jaspar .sites file
543
544 The instances and pwm are OK.
545 """
546
547 while True:
548 ln = stream.readline()
549 if ln=="" or ln[0]!=">":
550 break
551
552 ln=stream.readline().strip()
553 i=0
554 while ln[i]==ln[i].lower():
555 i+=1
556 inst=""
557 while i<len(ln) and ln[i]==ln[i].upper():
558 inst+=ln[i]
559 i+=1
560 inst=Seq(inst,self.alphabet)
561 self.add_instance(inst)
562
563 self.set_mask("*"*len(inst))
564 return self
565
566
568 """Returns the probability distribution over symbols at a given position, padding with background.
569
570 If the requested index is out of bounds, the returned distribution comes from background.
571 """
572 if index in range(self.length):
573 return self.pwm()[index]
574 else:
575 return self.background
576
578 """Returns the consensus sequence of a motif.
579 """
580 res=""
581 for i in range(self.length):
582 max_f=0
583 max_n="X"
584 for n in self[i].keys():
585 if self[i][n]>max_f:
586 max_f=self[i][n]
587 max_n=n
588 res+=max_n
589 return Seq(res,self.alphabet)
590
592 """returns the least probable pattern to be generated from this motif.
593 """
594 res=""
595 for i in range(self.length):
596 min_f=10.0
597 min_n="X"
598 for n in self[i].keys():
599 if self[i][n]<min_f:
600 min_f=self[i][n]
601 min_n=n
602 res+=min_n
603 return Seq(res,self.alphabet)
604
606 """Maximal possible score for this motif.
607
608 returns the score computed for the consensus sequence.
609 """
610 return self.score_hit(self.consensus(),0)
611
613 """Minimal possible score for this motif.
614
615 returns the score computed for the anticonsensus sequence.
616 """
617 return self.score_hit(self.anticonsensus(),0)
618
619 - def weblogo(self,fname,format="PNG",**kwds):
620 """
621 uses the Berkeley weblogo service to download and save a weblogo of itself
622
623 requires an internet connection.
624 The parameters from **kwds are passed directly to the weblogo server.
625 """
626 import urllib
627 import urllib2
628 al= self._to_fasta()
629 url = 'http://weblogo.berkeley.edu/logo.cgi'
630 values = {'sequence' : al,
631 'format' : format,
632 'logowidth' : '18',
633 'logoheight' : '5',
634 'logounits' : 'cm',
635 'kind' : 'AUTO',
636 'firstnum' : "1",
637 'command' : 'Create Logo',
638 'smallsamplecorrection' : "on",
639 'symbolsperline' : 32,
640 'res' : '96',
641 'res_units' : 'ppi',
642 'antialias' : 'on',
643 'title' : '',
644 'barbits' : '',
645 'xaxis': 'on',
646 'xaxis_label' : '',
647 'yaxis': 'on',
648 'yaxis_label' : '',
649 'showends' : 'on',
650 'shrink' : '0.5',
651 'fineprint' : 'on',
652 'ticbits' : '1',
653 'colorscheme' : 'DEFAULT',
654 'color1' : 'green',
655 'color2' : 'blue',
656 'color3' : 'red',
657 'color4' : 'black',
658 'color5' : 'purple',
659 'color6' : 'orange',
660 'color1' : 'black',
661 }
662 for k,v in kwds.items():
663 values[k]=str(v)
664
665 data = urllib.urlencode(values)
666 req = urllib2.Request(url, data)
667 response = urllib2.urlopen(req)
668 f=open(fname,"w")
669 im=response.read()
670
671 f.write(im)
672 f.close()
673
674
676 """Write the representation of a motif in TRANSFAC format
677 """
678 res="XX\nTY Motif\n"
679 try:
680 res+="ID %s\n"%self.name
681 except:
682 pass
683 res+="BF undef\nP0"
684 for a in self.alphabet.letters:
685 res+=" %s"%a
686 res+="\n"
687 if not self.has_counts:
688 self.make_counts_from_instances()
689 for i in range(self.length):
690 if i<9:
691 res+="0%d"%(i+1)
692 else:
693 res+="%d"%(i+1)
694 for a in self.alphabet.letters:
695 res+=" %d"%self.counts[a][i]
696 res+="\n"
697 res+="XX\n"
698 return res
699
713
735
740
760