Package Bio :: Package Motif :: Module _Motif
[hide private]
[frames] | no frames]

Source Code for Module Bio.Motif._Motif

  1  # Copyright 2003 by Bartek Wilczynski.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  Implementation of sequence motifs. 
  7   
  8  Changes: 
  9  10.2007 - BW added matrix (vertical, horizontal) input, jaspar, transfac-like output 
 10  26.08.2007 - added a background attribute  (Bartek Wilczynski) 
 11  26.08.2007 - added a DPQ measure   (Bartek Wilczynski) 
 12  9.2007 (BW) : added the .to_fasta() and .weblogo() methods allowing to use the Berkeley weblogo server at http://weblogo.berkeley.edu/ 
 13  """ 
 14  from Bio.Seq import Seq 
 15  from Bio.SubsMat import FreqTable 
 16  from Bio.Alphabet import IUPAC 
 17  import math,random 
 18   
19 -class Motif(object):
20 """ 21 A class representing sequence motifs. 22 """
23 - def __init__(self,alphabet=IUPAC.unambiguous_dna):
24 self.instances = [] 25 self.has_instances=False 26 self.counts = {} 27 self.has_counts=False 28 self.mask = [] 29 self._pwm_is_current = False 30 self._pwm = [] 31 self._log_odds_is_current = False 32 self._log_odds = [] 33 self.alphabet=alphabet 34 self.length=None 35 self.background=dict(map(lambda n: (n,1.0/len(self.alphabet.letters)), self.alphabet.letters)) 36 self.beta=1.0 37 self.info=None 38 self.name=""
39
40 - def _check_length(self, len):
41 if self.length==None: 42 self.length = len 43 elif self.length != len: 44 print "len",self.length,self.instances 45 raise ValueError("You can't change the length of the motif")
46
47 - def _check_alphabet(self,alphabet):
48 if self.alphabet==None: 49 self.alphabet=alphabet 50 elif self.alphabet != alphabet: 51 raise ValueError("Wrong Alphabet")
52
53 - def add_instance(self,instance):
54 """ 55 adds new instance to the motif 56 """ 57 self._check_alphabet(instance.alphabet) 58 self._check_length(len(instance)) 59 if self.has_counts: 60 for i in range(self.length): 61 let=instance[i] 62 self.counts[let][i]+=1 63 64 if self.has_instances or not self.has_counts: 65 self.instances.append(instance) 66 self.has_instances=True 67 68 self._pwm_is_current = False 69 self._log_odds_is_current = False
70 71
72 - def set_mask(self,mask):
73 """ 74 sets the mask for the motif 75 76 The mask should be a string containing asterisks in the position of significant columns and spaces in other columns 77 """ 78 self._check_length(len(mask)) 79 self.mask=[] 80 for char in mask: 81 if char=="*": 82 self.mask.append(1) 83 elif char==" ": 84 self.mask.append(0) 85 else: 86 raise ValueError("Mask should contain only '*' or ' ' and not a '%s'"%char)
87
88 - def pwm(self,laplace=True):
89 """ 90 returns the PWM computed for the set of instances 91 92 if laplace=True (default), pseudocounts equal to self.background multiplied by self.beta are added to all positions. 93 """ 94 95 if self._pwm_is_current: 96 return self._pwm 97 #we need to compute new pwm 98 self._pwm = [] 99 for i in xrange(self.length): 100 dict = {} 101 #filling the dict with 0's 102 for letter in self.alphabet.letters: 103 if laplace: 104 dict[letter]=self.beta*self.background[letter] 105 else: 106 dict[letter]=0.0 107 if self.has_counts: 108 #taking the raw counts 109 for letter in self.alphabet.letters: 110 dict[letter]+=self.counts[letter][i] 111 elif self.has_instances: 112 #counting the occurences of letters in instances 113 for seq in self.instances: 114 #dict[seq[i]]=dict[seq[i]]+1 115 dict[seq[i]]+=1 116 self._pwm.append(FreqTable.FreqTable(dict,FreqTable.COUNT,self.alphabet)) 117 self._pwm_is_current=1 118 return self._pwm
119
120 - def log_odds(self,laplace=True):
121 """ 122 returns the logg odds matrix computed for the set of instances 123 """ 124 125 if self._log_odds_is_current: 126 return self._log_odds 127 #we need to compute new pwm 128 self._log_odds = [] 129 pwm=self.pwm(laplace) 130 for i in xrange(self.length): 131 d = {} 132 for a in self.alphabet.letters: 133 d[a]=math.log(pwm[i][a]/self.background[a],2) 134 self._log_odds.append(d) 135 self._log_odds_is_current=1 136 return self._log_odds
137
138 - def ic(self):
139 """Method returning the information content of a motif. 140 """ 141 res=0 142 pwm=self.pwm() 143 for i in range(self.length): 144 res+=2 145 for a in self.alphabet.letters: 146 if pwm[i][a]!=0: 147 res+=pwm[i][a]*math.log(pwm[i][a],2) 148 return res
149
150 - def exp_score(self,st_dev=False):
151 """ 152 Computes expected score of motif's instance and its standard deviation 153 """ 154 exs=0.0 155 var=0.0 156 pwm=self.pwm() 157 for i in range(self.length): 158 ex1=0.0 159 ex2=0.0 160 for a in self.alphabet.letters: 161 if pwm[i][a]!=0: 162 ex1+=pwm[i][a]*(math.log(pwm[i][a],2)-math.log(self.background[a],2)) 163 ex2+=pwm[i][a]*(math.log(pwm[i][a],2)-math.log(self.background[a],2))**2 164 exs+=ex1 165 var+=ex2-ex1**2 166 if st_dev: 167 return exs,math.sqrt(var) 168 else: 169 return exs
170
171 - def search_instances(self,sequence):
172 """ 173 a generator function, returning found positions of instances of the motif in a given sequence 174 """ 175 if not self.has_instances: 176 raise ValueError ("This motif has no instances") 177 for pos in xrange(0,len(sequence)-self.length+1): 178 for instance in self.instances: 179 if instance.tostring()==sequence[pos:pos+self.length].tostring(): 180 yield(pos,instance) 181 break # no other instance will fit (we don't want to return multiple hits)
182
183 - def score_hit(self,sequence,position,normalized=0,masked=0):
184 """ 185 give the pwm score for a given position 186 """ 187 lo=self.log_odds() 188 score = 0.0 189 for pos in xrange(self.length): 190 a = sequence[position+pos] 191 if not masked or self.mask[pos]: 192 try: 193 score += lo[pos][a] 194 except: 195 pass 196 if normalized: 197 if not masked: 198 score/=self.length 199 else: 200 score/=len(filter(lambda x: x, self.mask)) 201 return score
202
203 - def search_pwm(self,sequence,normalized=0,masked=0,threshold=0.0,both=True):
204 """ 205 a generator function, returning found hits in a given sequence with the pwm score higher than the threshold 206 """ 207 if both: 208 rc = self.reverse_complement() 209 210 sequence=sequence.tostring().upper() 211 for pos in xrange(0,len(sequence)-self.length+1): 212 score = self.score_hit(sequence,pos,normalized,masked) 213 if score > threshold: 214 yield (pos,score) 215 if both: 216 rev_score = rc.score_hit(sequence,pos,normalized,masked) 217 if rev_score > threshold: 218 yield (-pos,rev_score)
219
220 - def dist_pearson(self, motif, masked = 0):
221 """ 222 return the similarity score based on pearson correlation for the given motif against self. 223 224 We use the Pearson's correlation of the respective probabilities. 225 """ 226 227 if self.alphabet != motif.alphabet: 228 raise ValueError("Cannot compare motifs with different alphabets") 229 230 max_p=-2 231 for offset in range(-self.length+1,motif.length): 232 if offset<0: 233 p = self.dist_pearson_at(motif,-offset) 234 else: #offset>=0 235 p = motif.dist_pearson_at(self,offset) 236 237 if max_p<p: 238 max_p=p 239 max_o=-offset 240 return 1-max_p,max_o
241
242 - def dist_pearson_at(self,motif,offset):
243 sxx = 0 # \sum x^2 244 sxy = 0 # \sum x \cdot y 245 sx = 0 # \sum x 246 sy = 0 # \sum y 247 syy = 0 # \sum x^2 248 norm=max(self.length,offset+motif.length) 249 250 for pos in range(max(self.length,offset+motif.length)): 251 for l in self.alphabet.letters: 252 xi = self[pos][l] 253 yi = motif[pos-offset][l] 254 sx = sx + xi 255 sy = sy + yi 256 sxx = sxx + xi * xi 257 syy = syy + yi * yi 258 sxy = sxy + xi * yi 259 260 norm *= len(self.alphabet.letters) 261 s1 = (sxy - sx*sy*1.0/norm) 262 s2 = (norm*sxx - sx*sx*1.0)*(norm*syy- sy*sy*1.0) 263 return s1/math.sqrt(s2)
264
265 - def dist_product(self,other):
266 """ 267 A similarity measure taking into account a product probability of generating overlaping instances of two motifs 268 """ 269 max_p=0.0 270 for offset in range(-self.length+1,other.length): 271 if offset<0: 272 p = self.dist_product_at(other,-offset) 273 else: #offset>=0 274 p = other.dist_product_at(self,offset) 275 if max_p<p: 276 max_p=p 277 max_o=-offset 278 return 1-max_p/self.dist_product_at(self,0),max_o
279
280 - def dist_product_at(self,other,offset):
281 s=0 282 for i in range(max(self.length,offset+other.length)): 283 f1=self[i] 284 f2=other[i-offset] 285 for n,b in self.background.items(): 286 s+=b*f1[n]*f2[n] 287 return s/i
288
289 - def dist_dpq(self,other):
290 r"""Calculates the DPQ distance measure between motifs. 291 292 It is calculated as a maximal value of DPQ formula (shown using LaTeX 293 markup, familiar to mathematicians): 294 295 \sqrt{\sum_{i=1}^{alignment.len()} \sum_{k=1}^alphabet.len() \ 296 \{ m1[i].freq(alphabet[k])*log_2(m1[i].freq(alphabet[k])/m2[i].freq(alphabet[k])) + 297 m2[i].freq(alphabet[k])*log_2(m2[i].freq(alphabet[k])/m1[i].freq(alphabet[k])) 298 } 299 300 over possible non-spaced alignemts of two motifs. See this reference: 301 302 D. M Endres and J. E Schindelin, "A new metric for probability 303 distributions", IEEE transactions on Information Theory 49, no. 7 304 (July 2003): 1858-1860. 305 """ 306 307 min_d=float("inf") 308 min_o=-1 309 d_s=[] 310 for offset in range(-self.length+1,other.length): 311 #print "%2.3d"%offset, 312 if offset<0: 313 d = self.dist_dpq_at(other,-offset) 314 overlap = self.length+offset 315 else: #offset>=0 316 d = other.dist_dpq_at(self,offset) 317 overlap = other.length-offset 318 overlap = min(self.length,other.length,overlap) 319 out = self.length+other.length-2*overlap 320 #print d,1.0*(overlap+out)/overlap,d*(overlap+out)/overlap 321 #d = d/(2*overlap) 322 d = (d/(out+overlap))*(2*overlap+out)/(2*overlap) 323 #print d 324 d_s.append((offset,d)) 325 if min_d> d: 326 min_d=d 327 min_o=-offset 328 return min_d,min_o#,d_s
329
330 - def dist_dpq_at(self,other,offset):
331 """ 332 calculates the dist_dpq measure with a given offset. 333 334 offset should satisfy 0<=offset<=self.length 335 """ 336 def dpq (f1,f2,alpha): 337 s=0 338 for n in alpha.letters: 339 avg=(f1[n]+f2[n])/2 340 s+=f1[n]*math.log(f1[n]/avg,2)+f2[n]*math.log(f2[n]/avg,2) 341 return math.sqrt(s)
342 343 s=0 344 for i in range(max(self.length,offset+other.length)): 345 f1=self[i] 346 f2=other[i-offset] 347 s+=dpq(f1,f2,self.alphabet) 348 return s
349
350 - def _read(self,stream):
351 """Reads the motif from the stream (in AlignAce format). 352 353 the self.alphabet variable must be set beforehand. 354 If the last line contains asterisks it is used for setting mask 355 """ 356 357 while 1: 358 ln = stream.readline() 359 if "*" in ln: 360 self.set_mask(ln.strip("\n\c")) 361 break 362 self.add_instance(Seq(ln.strip(),self.alphabet))
363
364 - def __str__(self,masked=False):
365 """ string representation of a motif. 366 """ 367 str = "" 368 for inst in self.instances: 369 str = str + inst.tostring() + "\n" 370 371 if masked: 372 for i in xrange(self.length): 373 if self.mask[i]: 374 str = str + "*" 375 else: 376 str = str + " " 377 str = str + "\n" 378 return str
379
380 - def __len__(self):
381 """return the length of a motif 382 """ 383 if self.length==None: 384 return 0 385 else: 386 return self.length
387
388 - def _write(self,stream):
389 """ 390 writes the motif to the stream 391 """ 392 393 stream.write(self.__str__())
394 395 396
397 - def _to_fasta(self):
398 """ 399 FASTA representation of motif 400 """ 401 if not self.has_instances: 402 self.make_instances_from_counts() 403 str = "" 404 for i,inst in enumerate(self.instances): 405 str = str + "> instance %d\n"%i + inst.tostring() + "\n" 406 407 return str
408
409 - def reverse_complement(self):
410 """ 411 Gives the reverse complement of the motif 412 """ 413 res = Motif() 414 if self.has_instances: 415 for i in self.instances: 416 res.add_instance(i.reverse_complement()) 417 else: # has counts 418 res.has_counts=True 419 res.counts["A"]=self.counts["T"] 420 res.counts["T"]=self.counts["A"] 421 res.counts["G"]=self.counts["C"] 422 res.counts["C"]=self.counts["G"] 423 res.counts["A"].reverse() 424 res.counts["C"].reverse() 425 res.counts["G"].reverse() 426 res.counts["T"].reverse() 427 res.length=self.length 428 res.mask = self.mask 429 return res
430 431
432 - def _from_jaspar_pfm(self,stream,make_instances=False):
433 """ 434 reads the motif from Jaspar .pfm file 435 436 The instances are fake, but the pwm is accurate. 437 """ 438 return self._from_horiz_matrix(stream,letters="ACGT",make_instances=make_instances)
439
440 - def _from_vert_matrix(self,stream,letters=None,make_instances=False):
441 """reads a vertical count matrix from stream and fill in the counts. 442 """ 443 444 self.counts = {} 445 self.has_counts=True 446 if letters==None: 447 letters=self.alphabet.letters 448 self.length=0 449 for i in letters: 450 self.counts[i]=[] 451 for ln in stream.readlines(): 452 rec=map(float,ln.strip().split()) 453 for k,v in zip(letters,rec): 454 self.counts[k].append(v) 455 self.length+=1 456 self.set_mask("*"*self.length) 457 if make_instances==True: 458 self.make_instances_from_counts() 459 return self
460
461 - def _from_horiz_matrix(self,stream,letters=None,make_instances=False):
462 """reads a horizontal count matrix from stream and fill in the counts. 463 """ 464 if letters==None: 465 letters=self.alphabet.letters 466 self.counts = {} 467 self.has_counts=True 468 469 for i in letters: 470 ln = stream.readline().strip().split() 471 #if there is a letter in the beginning, ignore it 472 if ln[0]==i: 473 ln=ln[1:] 474 #print ln 475 try: 476 self.counts[i]=map(int,ln) 477 except ValueError: #not integers 478 self.counts[i]=map(float,ln) #map(lambda s: int(100*float(s)),ln) 479 #print counts[i] 480 481 s = sum(map(lambda nuc: self.counts[nuc][0],letters)) 482 #print "sum", s 483 l = len(self.counts[letters[0]]) 484 self.length=l 485 self.set_mask("*"*l) 486 if make_instances==True: 487 self.make_instances_from_counts() 488 return self
489 490
491 - def make_instances_from_counts(self):
492 """Creates "fake" instances for a motif created from a count matrix. 493 494 In case the sums of counts are different for different columnes, the shorter columns are padded with background. 495 """ 496 alpha="".join(self.alphabet.letters) 497 #col[i] is a column taken from aligned motif instances 498 col=[] 499 self.has_instances=True 500 self.instances=[] 501 s = sum(map(lambda nuc: self.counts[nuc][0],self.alphabet.letters)) 502 for i in range(self.length): 503 col.append("") 504 for n in self.alphabet.letters: 505 col[i] = col[i]+ (n*(self.counts[n][i])) 506 if len(col[i])<s: 507 print "WARNING, column too short",len(col[i]),s 508 col[i]+=(alpha*s)[:(s-len(col[i]))] 509 #print i,col[i] 510 #iterate over instances 511 for i in range(s): 512 inst="" #start with empty seq 513 for j in range(self.length): #iterate over positions 514 inst+=col[j][i] 515 #print i,inst 516 inst=Seq(inst,self.alphabet) 517 self.add_instance(inst) 518 return self.instances
519
520 - def make_counts_from_instances(self):
521 """Creates the count matrix for a motif with instances. 522 523 """ 524 #make strings for "columns" of motifs 525 #col[i] is a column taken from aligned motif instances 526 counts={} 527 for a in self.alphabet.letters: 528 counts[a]=[] 529 self.has_counts=True 530 s = len(self.instances) 531 for i in range(self.length): 532 ci = dict(map(lambda a: (a,0),self.alphabet.letters)) 533 for inst in self.instances: 534 ci[inst[i]]+=1 535 for a in self.alphabet.letters: 536 counts[a].append(ci[a]) 537 self.counts=counts 538 return counts
539
540 - def _from_jaspar_sites(self,stream):
541 """ 542 reads the motif from Jaspar .sites file 543 544 The instances and pwm are OK. 545 """ 546 547 while True: 548 ln = stream.readline()# read the header "$>...." 549 if ln=="" or ln[0]!=">": 550 break 551 552 ln=stream.readline().strip()#read the actual sequence 553 i=0 554 while ln[i]==ln[i].lower(): 555 i+=1 556 inst="" 557 while i<len(ln) and ln[i]==ln[i].upper(): 558 inst+=ln[i] 559 i+=1 560 inst=Seq(inst,self.alphabet) 561 self.add_instance(inst) 562 563 self.set_mask("*"*len(inst)) 564 return self
565 566
567 - def __getitem__(self,index):
568 """Returns the probability distribution over symbols at a given position, padding with background. 569 570 If the requested index is out of bounds, the returned distribution comes from background. 571 """ 572 if index in range(self.length): 573 return self.pwm()[index] 574 else: 575 return self.background
576
577 - def consensus(self):
578 """Returns the consensus sequence of a motif. 579 """ 580 res="" 581 for i in range(self.length): 582 max_f=0 583 max_n="X" 584 for n in self[i].keys(): 585 if self[i][n]>max_f: 586 max_f=self[i][n] 587 max_n=n 588 res+=max_n 589 return Seq(res,self.alphabet)
590
591 - def anticonsensus(self):
592 """returns the least probable pattern to be generated from this motif. 593 """ 594 res="" 595 for i in range(self.length): 596 min_f=10.0 597 min_n="X" 598 for n in self[i].keys(): 599 if self[i][n]<min_f: 600 min_f=self[i][n] 601 min_n=n 602 res+=min_n 603 return Seq(res,self.alphabet)
604
605 - def max_score(self):
606 """Maximal possible score for this motif. 607 608 returns the score computed for the consensus sequence. 609 """ 610 return self.score_hit(self.consensus(),0)
611
612 - def min_score(self):
613 """Minimal possible score for this motif. 614 615 returns the score computed for the anticonsensus sequence. 616 """ 617 return self.score_hit(self.anticonsensus(),0)
618
619 - def weblogo(self,fname,format="PNG",**kwds):
620 """ 621 uses the Berkeley weblogo service to download and save a weblogo of itself 622 623 requires an internet connection. 624 The parameters from **kwds are passed directly to the weblogo server. 625 """ 626 import urllib 627 import urllib2 628 al= self._to_fasta() 629 url = 'http://weblogo.berkeley.edu/logo.cgi' 630 values = {'sequence' : al, 631 'format' : format, 632 'logowidth' : '18', 633 'logoheight' : '5', 634 'logounits' : 'cm', 635 'kind' : 'AUTO', 636 'firstnum' : "1", 637 'command' : 'Create Logo', 638 'smallsamplecorrection' : "on", 639 'symbolsperline' : 32, 640 'res' : '96', 641 'res_units' : 'ppi', 642 'antialias' : 'on', 643 'title' : '', 644 'barbits' : '', 645 'xaxis': 'on', 646 'xaxis_label' : '', 647 'yaxis': 'on', 648 'yaxis_label' : '', 649 'showends' : 'on', 650 'shrink' : '0.5', 651 'fineprint' : 'on', 652 'ticbits' : '1', 653 'colorscheme' : 'DEFAULT', 654 'color1' : 'green', 655 'color2' : 'blue', 656 'color3' : 'red', 657 'color4' : 'black', 658 'color5' : 'purple', 659 'color6' : 'orange', 660 'color1' : 'black', 661 } 662 for k,v in kwds.items(): 663 values[k]=str(v) 664 665 data = urllib.urlencode(values) 666 req = urllib2.Request(url, data) 667 response = urllib2.urlopen(req) 668 f=open(fname,"w") 669 im=response.read() 670 671 f.write(im) 672 f.close()
673 674
675 - def _to_transfac(self):
676 """Write the representation of a motif in TRANSFAC format 677 """ 678 res="XX\nTY Motif\n" #header 679 try: 680 res+="ID %s\n"%self.name 681 except: 682 pass 683 res+="BF undef\nP0" 684 for a in self.alphabet.letters: 685 res+=" %s"%a 686 res+="\n" 687 if not self.has_counts: 688 self.make_counts_from_instances() 689 for i in range(self.length): 690 if i<9: 691 res+="0%d"%(i+1) 692 else: 693 res+="%d"%(i+1) 694 for a in self.alphabet.letters: 695 res+=" %d"%self.counts[a][i] 696 res+="\n" 697 res+="XX\n" 698 return res
699
700 - def _to_vertical_matrix(self,letters=None):
701 """Return string representation of the motif as a matrix. 702 703 """ 704 if letters==None: 705 letters=self.alphabet.letters 706 self._pwm_is_current=False 707 pwm=self.pwm(laplace=False) 708 res="" 709 for i in range(self.length): 710 res+="\t".join([str(pwm[i][a]) for a in letters]) 711 res+="\n" 712 return res
713
714 - def _to_horizontal_matrix(self,letters=None,normalized=True):
715 """Return string representation of the motif as a matrix. 716 717 """ 718 if letters==None: 719 letters=self.alphabet.letters 720 res="" 721 if normalized: #output PWM 722 self._pwm_is_current=False 723 mat=self.pwm(laplace=False) 724 for a in letters: 725 res+="\t".join([str(mat[i][a]) for i in range(self.length)]) 726 res+="\n" 727 else: #output counts 728 if not self.has_counts: 729 self.make_counts_from_instances() 730 mat=self.counts 731 for a in letters: 732 res+="\t".join([str(mat[a][i]) for i in range(self.length)]) 733 res+="\n" 734 return res
735
736 - def _to_jaspar_pfm(self):
737 """Returns the pfm representation of the motif 738 """ 739 return self._to_horizontal_matrix(normalized=False,letters="ACGT")
740
741 - def format(self,format):
742 """Returns a string representation of the Motif in a given format 743 744 Currently supported fromats: 745 - jaspar-pfm : JASPAR Position Frequency Matrix 746 - transfac : TRANSFAC like files 747 - fasta : FASTA file with instances 748 """ 749 750 formatters={ 751 "jaspar-pfm": self._to_jaspar_pfm, 752 "transfac": self._to_transfac, 753 "fasta" : self._to_fasta, 754 } 755 756 try: 757 return formatters[format]() 758 except KeyError: 759 raise ValueError("Wrong format type")
760