Package Bio :: Module Seq
[hide private]
[frames] | no frames]

Source Code for Module Bio.Seq

  1  # Copyright 2000-2002 Brad Chapman. 
  2  # Copyright 2004-2005 by M de Hoon. 
  3  # Copyright 2007 by Peter Cock. 
  4  # All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  import string, array 
  9   
 10  import Alphabet 
 11  from Alphabet import IUPAC 
 12  from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement 
 13  from Bio.Data import CodonTable 
 14   
15 -class Seq:
16 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
17 # Enforce string storage 18 assert (type(data) == type("") or # must use a string 19 type(data) == type(u"")) # but can be a unicode string 20 21 self.data = data # Seq API requirement 22 self.alphabet = alphabet # Seq API requirement
23
24 - def __repr__(self):
25 """Returns a (truncated) representation of the sequence for debugging.""" 26 if len(self) > 60 : 27 #Shows the last three letters as it is often useful to see if there 28 #is a stop codon at the end of a sequence. 29 #Note total length is 54+3+3=60 30 return "%s('%s...%s', %s)" % (self.__class__.__name__, 31 self.data[:54], self.data[-3:], 32 repr(self.alphabet)) 33 else : 34 return "%s(%s, %s)" % (self.__class__.__name__, 35 repr(self.data), 36 repr(self.alphabet))
37 - def __str__(self):
38 """Returns the full sequence as a python string. 39 40 Note that Biopython 1.44 and earlier would give a truncated 41 version of repr(my_seq) for str(my_seq). If you are writing code 42 which need to be backwards compatible with old Biopython, you 43 should continue to use my_seq.tostring() rather than str(my_seq) 44 """ 45 return self.data
46 47 # I don't think I like this method... 48 ## def __cmp__(self, other): 49 ## if isinstance(other, Seq): 50 ## return cmp(self.data, other.data) 51 ## else: 52 ## return cmp(self.data, other) 53
54 - def __len__(self): return len(self.data) # Seq API requirement
55
56 - def __getitem__(self, index) : # Seq API requirement
57 #Note since Python 2.0, __getslice__ is deprecated 58 #and __getitem__ is used instead. 59 #See http://docs.python.org/ref/sequence-methods.html 60 if isinstance(index, int) : 61 #Return a single letter as a string 62 return self.data[index] 63 else : 64 #Return the (sub)sequence as another Seq object 65 return Seq(self.data[index], self.alphabet)
66
67 - def __add__(self, other):
68 if type(other) == type(' '): 69 return self.__class__(self.data + other, self.alphabet) 70 elif self.alphabet.contains(other.alphabet): 71 return self.__class__(self.data + other.data, self.alphabet) 72 elif other.alphabet.contains(self.alphabet): 73 return self.__class__(self.data + other.data, other.alphabet) 74 else: 75 raise TypeError, ("incompatable alphabets", str(self.alphabet), 76 str(other.alphabet))
77
78 - def __radd__(self, other):
79 if self.alphabet.contains(other.alphabet): 80 return self.__class__(other.data + self.data, self.alphabet) 81 elif other.alphabet.contains(self.alphabet): 82 return self.__class__(other.data + self.data, other.alphabet) 83 else: 84 raise TypeError, ("incompatable alphabets", str(self.alphabet), 85 str(other.alphabet))
86 87
88 - def tostring(self): # Seq API requirement
89 """Returns the full sequence as a python string. 90 91 Although not formally deprecated, you are now encouraged to use 92 str(my_seq) instead of my_seq.tostring().""" 93 return self.data 94
95 - def tomutable(self): # Needed? Or use a function?
96 return MutableSeq(self.data, self.alphabet) 97
98 - def count(self, sub, start=None, end=None):
99 """Count method, like that of a python string. 100 101 Return an integer, the number of occurrences of substring 102 argument sub in the (sub)sequence given by [start:end]. 103 Optional arguments start and end are interpreted as in slice 104 notation. 105 106 sub - a string or another Seq object to look for 107 start - optional integer, slice start 108 end - optional integer, slice end 109 110 e.g. 111 from Bio.Seq import Seq 112 my_seq = Seq("AAAATGA") 113 print my_seq.count("A") 114 print my_seq.count("ATG") 115 print my_seq.count(Seq("AT")) 116 print my_seq.count("AT", 2, -1) 117 """ 118 try : 119 #Assume sub is a Seq or MutableSeq object, so pass 120 #it to the string's count method as a string. 121 #TODO - Should we check the alphabet? 122 search = sub.tostring() 123 except AttributeError: 124 #Assume sub is a string. 125 search = sub 126 127 #TODO - More elegant way of doing this splice notation 128 if start is None and end is None : 129 return self.data.count(search) 130 elif end is None : 131 return self.data.count(search, start) 132 else : 133 return self.data.count(search, start, end)
134
135 - def __maketrans(self, alphabet) :
136 """Seq.__maketrans(alphabet) -> translation table. 137 138 Return a translation table for use with complement() 139 and reverse_complement(). 140 141 Compatible with lower case and upper case sequences. 142 143 alphabet is a dictionary as implement in Data.IUPACData 144 145 For internal use only. 146 """ 147 before = ''.join(alphabet.keys()) 148 after = ''.join(alphabet.values()) 149 before = before + before.lower() 150 after = after + after.lower() 151 return string.maketrans(before, after)
152
153 - def complement(self):
154 """Returns the complement sequence. New Seq object. 155 """ 156 if isinstance(self.alphabet, Alphabet.ProteinAlphabet) : 157 raise ValueError, "Proteins do not have complements!" 158 if isinstance(self.alphabet, Alphabet.DNAAlphabet) : 159 d = ambiguous_dna_complement 160 elif isinstance(self.alphabet, Alphabet.RNAAlphabet) : 161 d = ambiguous_rna_complement 162 elif 'U' in self.data: 163 d = ambiguous_rna_complement 164 else: 165 d = ambiguous_dna_complement 166 ttable = self.__maketrans(d) 167 #Much faster on really long sequences than the previous loop based one. 168 #thx to Michael Palmer, University of Waterloo 169 s = self.data.translate(ttable) 170 return Seq(s, self.alphabet)
171
172 - def reverse_complement(self):
173 """Returns the reverse complement sequence. New Seq object. 174 """ 175 #Use -1 stride to reverse the complement 176 return self.complement()[::-1]
177
178 -class MutableSeq:
179 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
180 if type(data) == type(""): 181 self.data = array.array("c", data) 182 else: 183 self.data = data # assumes the input is an array 184 self.alphabet = alphabet
185 - def __repr__(self):
186 """Returns a (truncated) representation of the sequence for debugging.""" 187 if len(self) > 60 : 188 #Shows the last three letters as it is often useful to see if there 189 #is a stop codon at the end of a sequence. 190 #Note total length is 54+3+3=60 191 return "%s('%s...%s', %s)" % (self.__class__.__name__, 192 str(self[:54]), str(self[-3:]), 193 repr(self.alphabet)) 194 else : 195 return "%s('%s', %s)" % (self.__class__.__name__, 196 str(self), 197 repr(self.alphabet))
198
199 - def __str__(self):
200 """Returns the full sequence as a python string. 201 202 Note that Biopython 1.44 and earlier would give a truncated 203 version of repr(my_seq) for str(my_seq). If you are writing code 204 which need to be backwards compatible with old Biopython, you 205 should continue to use my_seq.tostring() rather than str(my_seq). 206 """ 207 return "".join(self.data)
208
209 - def __cmp__(self, other):
210 if isinstance(other, MutableSeq): 211 x = cmp(self.alphabet, other.alphabet) 212 if x == 0: 213 return cmp(self.data, other.data) 214 return x 215 elif type(other) == type(""): 216 return cmp(self.data.tostring(), other) 217 elif isinstance(other, Seq): 218 x = cmp(self.alphabet, other.alphabet) 219 if x == 0: 220 return cmp(self.data.tostring(), other.data) 221 return x 222 else: 223 return cmp(self.data, other)
224
225 - def __len__(self): return len(self.data)
226
227 - def __getitem__(self, index) :
228 #Note since Python 2.0, __getslice__ is deprecated 229 #and __getitem__ is used instead. 230 #See http://docs.python.org/ref/sequence-methods.html 231 if isinstance(index, int) : 232 #Return a single letter as a string 233 return self.data[index] 234 else : 235 #Return the (sub)sequence as another Seq object 236 return MutableSeq(self.data[index], self.alphabet)
237
238 - def __setitem__(self, index, value):
239 #Note since Python 2.0, __setslice__ is deprecated 240 #and __setitem__ is used instead. 241 #See http://docs.python.org/ref/sequence-methods.html 242 if isinstance(index, int) : 243 #Replacing a single letter with a new string 244 self.data[index] = value 245 else : 246 #Replacing a sub-sequence 247 if isinstance(value, MutableSeq): 248 self.data[index] = value.data 249 elif isinstance(value, type(self.data)): 250 self.data[index] = value 251 else: 252 self.data[index] = array.array("c", str(value))
253
254 - def __delitem__(self, index):
255 #Note since Python 2.0, __delslice__ is deprecated 256 #and __delitem__ is used instead. 257 #See http://docs.python.org/ref/sequence-methods.html 258 259 #Could be deleting a single letter, or a slice 260 del self.data[index]
261
262 - def __add__(self, other):
263 if self.alphabet.contains(other.alphabet): 264 return self.__class__(self.data + other.data, self.alphabet) 265 elif other.alphabet.contains(self.alphabet): 266 return self.__class__(self.data + other.data, other.alphabet) 267 else: 268 raise TypeError, ("incompatable alphabets", str(self.alphabet), 269 str(other.alphabet))
270 - def __radd__(self, other):
271 if self.alphabet.contains(other.alphabet): 272 return self.__class__(other.data + self.data, self.alphabet) 273 elif other.alphabet.contains(self.alphabet): 274 return self.__class__(other.data + self.data, other.alphabet) 275 else: 276 raise TypeError, ("incompatable alphabets", str(self.alphabet), 277 str(other.alphabet))
278
279 - def append(self, c):
280 self.data.append(c)
281 - def insert(self, i, c):
282 self.data.insert(i, c)
283 - def pop(self, i = (-1)):
284 c = self.data[i] 285 del self.data[i] 286 return c
287 - def remove(self, item):
288 for i in range(len(self.data)): 289 if self.data[i] == item: 290 del self.data[i] 291 return 292 raise ValueError, "MutableSeq.remove(x): x not in list"
293
294 - def count(self, sub, start=None, end=None):
295 """Count method, like that of a python string. 296 297 Return an integer, the number of occurrences of substring 298 argument sub in the (sub)sequence given by [start:end]. 299 Optional arguments start and end are interpreted as in slice 300 notation. 301 302 sub - a string or another Seq object to look for 303 start - optional integer, slice start 304 end - optional integer, slice end 305 306 e.g. 307 from Bio.Seq import MutableSeq 308 my_mseq = MutableSeq("AAAATGA") 309 print my_mseq.count("A") 310 print my_mseq.count("ATG") 311 print my_mseq.count(Seq("AT")) 312 print my_mseq.count("AT", 2, -1) 313 """ 314 if len(sub) == 1 : 315 #Try and be efficient and work directly from the array. 316 try : 317 #Assume item is a single letter Seq/MutableSeq 318 #TODO - Should we check the alphabet? 319 letter = sub.tostring() 320 except AttributeError : 321 letter = sub 322 323 count = 0 324 #TODO - More elegant way of doing this splice notation 325 if start is None and end is None : 326 for c in self.data: 327 if c == letter: count += 1 328 elif end is None : 329 for c in self.data[start:]: 330 if c == letter: count += 1 331 else : 332 for c in self.data[start:end]: 333 if c == letter: count += 1 334 return count 335 else : 336 #TODO - Can we do this more efficiently? 337 #We could use the self.tostring().count(...) method 338 return self.toseq().count(sub, start, end)
339
340 - def index(self, item):
341 for i in range(len(self.data)): 342 if self.data[i] == item: 343 return i 344 raise ValueError, "MutableSeq.index(x): x not in list"
345
346 - def reverse(self):
347 """Modify the MutableSequence to reverse itself. 348 349 No return value""" 350 self.data.reverse()
351
352 - def complement(self):
353 """Modify the MutableSequence to take on its complement. 354 355 No return value""" 356 if isinstance(self.alphabet, Alphabet.ProteinAlphabet) : 357 raise ValueError, "Proteins do not have complements!" 358 if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna): 359 d = ambiguous_dna_complement 360 elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna): 361 d = ambiguous_rna_complement 362 elif 'U' in self.data: 363 d = ambiguous_rna_complement 364 else: 365 d = ambiguous_dna_complement 366 c = dict([(x.lower(), y.lower()) for x,y in d.iteritems()]) 367 d.update(c) 368 self.data = map(lambda c: d[c], self.data) 369 self.data = array.array('c', self.data)
370
371 - def reverse_complement(self):
372 """Modify the MutableSequence to take on its reverse complement. 373 374 No return value""" 375 if isinstance(self.alphabet, Alphabet.ProteinAlphabet) : 376 raise ValueError, "Proteins do not have complements!" 377 self.complement() 378 self.data.reverse()
379 380 ## Sorting a sequence makes no sense. 381 # def sort(self, *args): self.data.sort(*args) 382
383 - def extend(self, other):
384 if isinstance(other, MutableSeq): 385 for c in other.data: 386 self.data.append(c) 387 else: 388 for c in other: 389 self.data.append(c)
390
391 - def tostring(self):
392 """Returns the full sequence as a python string. 393 394 Although not formally deprecated, you are now encouraged to use 395 str(my_seq) instead of my_seq.tostring().""" 396 return "".join(self.data)
397
398 - def toseq(self):
399 """Returns the full sequence as a new immutable Seq object""" 400 return Seq("".join(self.data), self.alphabet)
401 402 403 # The transcribe, backward_transcribe, and translate functions are 404 # user-friendly versions of the corresponding functions in Bio.Transcribe 405 # and Bio.Translate. The functions work both on Seq objects, and on strings. 406
407 -def transcribe(dna):
408 """Transcribes a DNA sequence into RNA. 409 410 If given a string, returns a new string object. 411 Given a Seq or MutableSeq, returns a new Seq object with the same alphabet. 412 """ 413 if isinstance(dna, Seq) or isinstance(dna, MutableSeq): 414 if isinstance(dna.alphabet, Alphabet.ProteinAlphabet) : 415 raise ValueError, "Proteins cannot be transcribed!" 416 #TODO - Raise an error if already is RNA alphabet? 417 rna = dna.tostring().replace('T','U').replace('t','u') 418 if dna.alphabet==IUPAC.unambiguous_dna: 419 alphabet = IUPAC.unambiguous_rna 420 elif dna.alphabet==IUPAC.ambiguous_dna: 421 alphabet = IUPAC.ambiguous_rna 422 else: 423 alphabet = Alphabet.generic_rna 424 return Seq(rna, alphabet) 425 else: 426 rna = dna.replace('T','U').replace('t','u') 427 return rna
428 429
430 -def back_transcribe(rna):
431 """Back-transcribes an RNA sequence into DNA. 432 433 If given a string, returns a new string object. 434 Given a Seq or MutableSeq, returns a new Seq object with the same alphabet. 435 """ 436 if isinstance(rna, Seq) or isinstance(rna, MutableSeq): 437 if isinstance(rna.alphabet, Alphabet.ProteinAlphabet) : 438 raise ValueError, "Proteins cannot be (back)transcribed!" 439 #TODO - Raise an error if already is DNA alphabet? 440 dna = rna.data.replace('U','T').replace('u','t') 441 if rna.alphabet==IUPAC.unambiguous_rna: 442 alphabet = IUPAC.unambiguous_dna 443 elif rna.alphabet==IUPAC.ambiguous_rna: 444 alphabet = IUPAC.ambiguous_dna 445 else: 446 alphabet = Alphabet.generic_dna 447 return Seq(dna, alphabet) 448 else: 449 dna = rna.replace('U','T').replace('u','t') 450 return dna
451 452
453 -def translate(sequence, table = "Standard", stop_symbol = "*"):
454 """Translate a nucleotide sequence into amino acids. 455 456 If given a string, returns a new string object. 457 Given a Seq or MutableSeq, returns a Seq object. 458 459 table - Which codon table to use? This can be either a name 460 (string) or an identifier (integer) 461 462 NOTE - Does NOT support ambiguous nucleotide sequences which 463 could code for a stop codon. This will throw a TranslationError. 464 465 NOTE - Does NOT support gapped sequences. 466 467 It will however translate either DNA or RNA.""" 468 try: 469 id = int(table) 470 except: 471 id = None 472 if isinstance(sequence, Seq) or isinstance(sequence, MutableSeq): 473 if isinstance(sequence.alphabet, Alphabet.ProteinAlphabet) : 474 raise ValueError, "Proteins cannot be translated!" 475 if sequence.alphabet==IUPAC.unambiguous_dna: 476 if id==None: 477 table = CodonTable.unambiguous_dna_by_name[table] 478 else: 479 table = CodonTable.unambiguous_dna_by_id[id] 480 elif sequence.alphabet==IUPAC.ambiguous_dna: 481 if id==None: 482 table = CodonTable.ambiguous_dna_by_name[table] 483 else: 484 table = CodonTable.ambiguous_dna_by_id[id] 485 elif sequence.alphabet==IUPAC.unambiguous_rna: 486 if id==None: 487 table = CodonTable.unambiguous_rna_by_name[table] 488 else: 489 table = CodonTable.unambiguous_rna_by_id[id] 490 elif sequence.alphabet==IUPAC.ambiguous_rna: 491 if id==None: 492 table = CodonTable.ambiguous_rna_by_name[table] 493 else: 494 table = CodonTable.ambiguous_rna_by_id[id] 495 else: 496 if id==None: 497 table = CodonTable.ambiguous_generic_by_name[table] 498 else: 499 table = CodonTable.ambiguous_generic_by_id[id] 500 sequence = sequence.tostring().upper() 501 n = len(sequence) 502 get = table.forward_table.get 503 protein = [get(sequence[i:i+3], stop_symbol) for i in xrange(0,n-n%3,3)] 504 protein = "".join(protein) 505 if stop_symbol in protein : 506 alphabet = Alphabet.HasStopCodon(table.protein_alphabet, 507 stop_symbol = stop_symbol) 508 else : 509 alphabet = table.protein_alphabet 510 return Seq(protein, alphabet) 511 else: 512 if id==None: 513 table = CodonTable.ambiguous_generic_by_name[table] 514 else: 515 table = CodonTable.ambiguous_generic_by_id[id] 516 get = table.forward_table.get 517 sequence = sequence.upper() 518 n = len(sequence) 519 protein = [get(sequence[i:i+3], stop_symbol) for i in xrange(0,n-n%3,3)] 520 protein = "".join(protein) 521 return protein
522 523
524 -def reverse_complement(sequence):
525 """Returns the reverse complement sequence of a nucleotide string. 526 527 If given a string, returns a new string object. 528 Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet. 529 530 Supports unambiguous nucleotide sequences 531 """ 532 if isinstance(sequence, Seq) : 533 #Return a Seq 534 return sequence.reverse_complement() 535 elif isinstance(sequence, MutableSeq) : 536 #Return a Seq 537 #Don't use the MutableSeq reverse_complement method as it is 'in place'. 538 return sequence.toseq().reverse_complement() 539 else : 540 #Assume its a string, turn it into a Seq, 541 #do the reverse complement, and turn this back to a string 542 #TODO - Find a more efficient way to do this without code duplication? 543 return Seq(sequence).reverse_complement().tostring()
544 545 if __name__ == "__main__" : 546 print "Quick self test" 547 from Bio.Data.IUPACData import ambiguous_dna_values, ambiguous_rna_values# 548 from Bio.Alphabet import generic_dna, generic_rna 549 from sets import Set 550 print ambiguous_dna_complement 551 for ambig_char, values in ambiguous_dna_values.iteritems() : 552 compl_values = reverse_complement(values)[::-1] 553 print "%s={%s} --> {%s}=%s" % \ 554 (ambig_char, values, compl_values, ambiguous_dna_complement[ambig_char]) 555 assert Set(compl_values) == Set(ambiguous_dna_values[ambiguous_dna_complement[ambig_char]]) 556 557 for s in ["".join(ambiguous_dna_values), 558 Seq("".join(ambiguous_dna_values)), 559 Seq("".join(ambiguous_dna_values), generic_dna), 560 "".join(ambiguous_rna_values), 561 Seq("".join(ambiguous_rna_values)), 562 Seq("".join(ambiguous_dna_values), generic_rna)]: 563 print "%s -> %s [RC]" % (repr(s), repr(reverse_complement(s))) 564 print "%s -> %s [RNA]" % (repr(s), repr(transcribe(s))) 565 print "%s -> %s [DNA]" % (repr(s), repr(back_transcribe(s))) 566 567 #Quick check of the count method 568 for letter in "ABCDEFGHIjklmnopqrstuvwyz" : 569 assert 1 == Seq(letter).count(letter) 570 assert 1 == MutableSeq(letter).count(letter) 571 my_str = "AAAATGACGGCGGCGGCT" 572 for my_obj in [my_str, Seq(my_str), MutableSeq(my_str)] : 573 assert 5 == my_obj.count("A") 574 assert 1 == my_obj.count("ATG") 575 assert 3 == my_obj.count("CG") 576 assert 2 == my_obj.count("A", 3, -5) 577 for my_obj in [Seq(my_str), MutableSeq(my_str)] : 578 assert 1 == my_obj.count(Seq("AT")) 579 assert 5 == my_obj.count(Seq("A")) 580 assert 3 == my_obj.count(Seq("CG")) 581 assert 2 == my_obj.count(Seq("A"), 3, -5) 582 assert 1 == my_obj.count(MutableSeq("AT")) 583 assert 5 == my_obj.count(MutableSeq("A")) 584 assert 3 == my_obj.count(MutableSeq("CG")) 585 assert 2 == my_obj.count(MutableSeq("A"), 3, -5) 586 for start in range(-len(my_str), len(my_str)) : 587 for end in range(-len(my_str), len(my_str)) : 588 c = my_str.count("A",start,end) 589 assert c == my_str[start:end].count("A") 590 assert c == my_obj.count("A",start,end) 591 assert c == my_obj[start:end].count("A") 592 #This one is a bit silly: 593 assert my_str[start:end:-1].count("A") == my_obj[start:end:-1].count("A") 594 595 print repr(translate(Seq("GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGT", 596 IUPAC.unambiguous_dna))) 597 print repr(translate(Seq("GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG", 598 IUPAC.unambiguous_dna))) 599 print repr(translate(Seq("GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG", 600 IUPAC.unambiguous_dna), stop_symbol="@")) 601 602 ambig = Set(IUPAC.IUPACAmbiguousDNA.letters) 603 for c1 in ambig : 604 for c2 in ambig : 605 for c3 in ambig : 606 values = Set([translate(a+b+c, table=1) \ 607 for a in ambiguous_dna_values[c1] \ 608 for b in ambiguous_dna_values[c2] \ 609 for c in ambiguous_dna_values[c3]]) 610 try : 611 t = translate(c1+c2+c3) 612 except CodonTable.TranslationError : 613 assert "*" in values 614 continue 615 if t=="*" : 616 assert values == Set(["*"]) 617 elif t=="X" : 618 assert len(values) > 1, \ 619 "translate('%s') = '%s' not '%s'" \ 620 % (c1+c2+c3, t, ",".join(values)) 621 elif t=="Z" : 622 assert values == Set(("E", "Q")) 623 elif t=="B" : 624 assert values == Set(["D", "N"]) 625 else : 626 assert values == Set(t) 627