Package Bio :: Module Seq
[hide private]
[frames] | no frames]

Source Code for Module Bio.Seq

   1  # Copyright 2000-2002 Brad Chapman. 
   2  # Copyright 2004-2005 by M de Hoon. 
   3  # Copyright 2007-2009 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Provides objects to represent biological sequences with alphabets. 
   9   
  10  See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial: 
  11   - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html} 
  12   - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf} 
  13  """ 
  14  __docformat__ ="epytext en" #Don't just use plain text in epydoc API pages! 
  15   
  16  import string #for maketrans only 
  17  import array 
  18  import sys 
  19   
  20  #TODO - Remove this work around once we drop python 2.3 support 
  21  try: 
  22      set = set 
  23  except NameError: 
  24      from sets import Set as set 
  25   
  26  import Alphabet 
  27  from Alphabet import IUPAC 
  28  from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement 
  29  from Bio.Data import CodonTable 
  30   
31 -def _maketrans(complement_mapping) :
32 """Makes a python string translation table (PRIVATE). 33 34 Arguments: 35 - complement_mapping - a dictionary such as ambiguous_dna_complement 36 and ambiguous_rna_complement from Data.IUPACData. 37 38 Returns a translation table (a string of length 256) for use with the 39 python string's translate method to use in a (reverse) complement. 40 41 Compatible with lower case and upper case sequences. 42 43 For internal use only. 44 """ 45 before = ''.join(complement_mapping.keys()) 46 after = ''.join(complement_mapping.values()) 47 before = before + before.lower() 48 after = after + after.lower() 49 return string.maketrans(before, after)
50 51 _dna_complement_table = _maketrans(ambiguous_dna_complement) 52 _rna_complement_table = _maketrans(ambiguous_rna_complement) 53
54 -class Seq(object):
55 """A read-only sequence object (essentially a string with an alphabet). 56 57 Like normal python strings, our basic sequence object is immutable. 58 This prevents you from doing my_seq[5] = "A" for example, but does allow 59 Seq objects to be used as dictionary keys. 60 61 The Seq object provides a number of string like methods (such as count, 62 find, split and strip), which are alphabet aware where appropriate. 63 64 The Seq object also provides some biological methods, such as complement, 65 reverse_complement, transcribe, back_transcribe and translate (which are 66 not applicable to sequences with a protein alphabet). 67 """
68 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
69 """Create a Seq object. 70 71 Arguments: 72 - seq - Sequence, required (string) 73 - alphabet - Optional argument, an Alphabet object from Bio.Alphabet 74 75 You will typically use Bio.SeqIO to read in sequences from files as 76 SeqRecord objects, whose sequence will be exposed as a Seq object via 77 the seq property. 78 79 However, will often want to create your own Seq objects directly: 80 81 >>> from Bio.Seq import Seq 82 >>> from Bio.Alphabet import IUPAC 83 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 84 ... IUPAC.protein) 85 >>> my_seq 86 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 87 >>> print my_seq 88 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 89 """ 90 # Enforce string storage 91 assert (type(data) == type("") or # must use a string 92 type(data) == type(u"")) # but can be a unicode string 93 self._data = data 94 self.alphabet = alphabet # Seq API requirement
95 96 # A data property is/was a Seq API requirement
97 - def _set_data(self, value) :
98 #TODO - In the next release, actually raise an exception? 99 #The Seq object is like a python string, it should be read only! 100 import warnings 101 warnings.warn("Writing to the Seq object's .data propery is deprecated.", 102 DeprecationWarning) 103 self._data = value
104 data = property(fget= lambda self : str(self), 105 fset=_set_data, 106 doc="Sequence as a string (DEPRECATED)") 107
108 - def __repr__(self):
109 """Returns a (truncated) representation of the sequence for debugging.""" 110 if len(self) > 60 : 111 #Shows the last three letters as it is often useful to see if there 112 #is a stop codon at the end of a sequence. 113 #Note total length is 54+3+3=60 114 return "%s('%s...%s', %s)" % (self.__class__.__name__, 115 str(self)[:54], str(self)[-3:], 116 repr(self.alphabet)) 117 else : 118 return "%s(%s, %s)" % (self.__class__.__name__, 119 repr(self.data), 120 repr(self.alphabet))
121 - def __str__(self):
122 """Returns the full sequence as a python string. 123 124 Note that Biopython 1.44 and earlier would give a truncated 125 version of repr(my_seq) for str(my_seq). If you are writing code 126 which need to be backwards compatible with old Biopython, you 127 should continue to use my_seq.tostring() rather than str(my_seq). 128 """ 129 return self._data
130 131 """ 132 TODO - Work out why this breaks test_Restriction.py 133 (Comparing Seq objects would be nice to have. May need to think about 134 hashes and the in operator for when have list/dictionary of Seq objects...) 135 def __cmp__(self, other): 136 if hasattr(other, "alphabet") : 137 #other should be a Seq or a MutableSeq 138 if not Alphabet._check_type_compatible([self.alphabet, 139 other.alphabet]) : 140 raise TypeError("Incompatable alphabets %s and %s" \ 141 % (repr(self.alphabet), repr(other.alphabet))) 142 #They should be the same sequence type (or one of them is generic) 143 return cmp(str(self), str(other)) 144 elif isinstance(other, basestring) : 145 return cmp(str(self), other) 146 else : 147 raise TypeError 148 """ 149
150 - def __len__(self): return len(self._data) # Seq API requirement
151
152 - def __getitem__(self, index) : # Seq API requirement
153 #Note since Python 2.0, __getslice__ is deprecated 154 #and __getitem__ is used instead. 155 #See http://docs.python.org/ref/sequence-methods.html 156 if isinstance(index, int) : 157 #Return a single letter as a string 158 return self._data[index] 159 else : 160 #Return the (sub)sequence as another Seq object 161 return Seq(self._data[index], self.alphabet)
162
163 - def __add__(self, other):
164 """Add another sequence or string to this sequence.""" 165 if hasattr(other, "alphabet") : 166 #other should be a Seq or a MutableSeq 167 if not Alphabet._check_type_compatible([self.alphabet, 168 other.alphabet]) : 169 raise TypeError("Incompatable alphabets %s and %s" \ 170 % (repr(self.alphabet), repr(other.alphabet))) 171 #They should be the same sequence type (or one of them is generic) 172 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 173 return self.__class__(str(self) + str(other), a) 174 elif isinstance(other, basestring) : 175 #other is a plain string - use the current alphabet 176 return self.__class__(str(self) + other, self.alphabet) 177 else : 178 raise TypeError
179
180 - def __radd__(self, other):
181 if hasattr(other, "alphabet") : 182 #other should be a Seq or a MutableSeq 183 if not Alphabet._check_type_compatible([self.alphabet, 184 other.alphabet]) : 185 raise TypeError("Incompatable alphabets %s and %s" \ 186 % (repr(self.alphabet), repr(other.alphabet))) 187 #They should be the same sequence type (or one of them is generic) 188 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 189 return self.__class__(str(other) + str(self), a) 190 elif isinstance(other, basestring) : 191 #other is a plain string - use the current alphabet 192 return self.__class__(other + str(self), self.alphabet) 193 else : 194 raise TypeError
195
196 - def tostring(self): # Seq API requirement
197 """Returns the full sequence as a python string. 198 199 Although not formally deprecated, you are now encouraged to use 200 str(my_seq) instead of my_seq.tostring().""" 201 return str(self) 202
203 - def tomutable(self): # Needed? Or use a function?
204 """Returns the full sequence as a MutableSeq object. 205 206 >>> from Bio.Seq import Seq 207 >>> from Bio.Alphabet import IUPAC 208 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL", 209 ... IUPAC.protein) 210 >>> my_seq 211 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 212 >>> my_seq.tomutable() 213 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 214 215 Note that the alphabet is preserved. 216 """ 217 return MutableSeq(str(self), self.alphabet) 218
219 - def _get_seq_str_and_check_alphabet(self, other_sequence) :
220 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE). 221 222 For a string argument, returns the string. 223 224 For a Seq or MutableSeq, it checks the alphabet is compatible 225 (raising an exception if it isn't), and then returns a string. 226 """ 227 try : 228 other_alpha = other_sequence.alphabet 229 except AttributeError : 230 #Assume other_sequence is a string 231 return other_sequence 232 233 #Other should be a Seq or a MutableSeq 234 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]) : 235 raise TypeError("Incompatable alphabets %s and %s" \ 236 % (repr(self.alphabet), repr(other_alpha))) 237 #Return as a string 238 return str(other_sequence)
239
240 - def count(self, sub, start=0, end=sys.maxint):
241 """Non-overlapping count method, like that of a python string. 242 243 This behaves like the python string method of the same name, 244 which does a non-overlapping count! 245 246 Returns an integer, the number of occurrences of substring 247 argument sub in the (sub)sequence given by [start:end]. 248 Optional arguments start and end are interpreted as in slice 249 notation. 250 251 Arguments: 252 - sub - a string or another Seq object to look for 253 - start - optional integer, slice start 254 - end - optional integer, slice end 255 256 e.g. 257 258 >>> from Bio.Seq import Seq 259 >>> my_seq = Seq("AAAATGA") 260 >>> print my_seq.count("A") 261 5 262 >>> print my_seq.count("ATG") 263 1 264 >>> print my_seq.count(Seq("AT")) 265 1 266 >>> print my_seq.count("AT", 2, -1) 267 1 268 269 HOWEVER, please note because python strings and Seq objects (and 270 MutableSeq objects) do a non-overlapping search, this may not give 271 the answer you expect: 272 273 >>> "AAAA".count("AA") 274 2 275 >>> print Seq("AAAA").count("AA") 276 2 277 278 A non-overlapping search would give the answer as three! 279 """ 280 #If it has one, check the alphabet: 281 sub_str = self._get_seq_str_and_check_alphabet(sub) 282 return str(self).count(sub_str, start, end)
283
284 - def find(self, sub, start=0, end=sys.maxint):
285 """Find method, like that of a python string. 286 287 This behaves like the python string method of the same name. 288 289 Returns an integer, the index of the first occurrence of substring 290 argument sub in the (sub)sequence given by [start:end]. 291 292 Arguments: 293 - sub - a string or another Seq object to look for 294 - start - optional integer, slice start 295 - end - optional integer, slice end 296 297 Returns -1 if the subsequence is NOT found. 298 299 e.g. Locating the first typical start codon, AUG, in an RNA sequence: 300 301 >>> from Bio.Seq import Seq 302 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 303 >>> my_rna.find("AUG") 304 3 305 """ 306 #If it has one, check the alphabet: 307 sub_str = self._get_seq_str_and_check_alphabet(sub) 308 return str(self).find(sub_str, start, end)
309
310 - def rfind(self, sub, start=0, end=sys.maxint):
311 """Find from right method, like that of a python string. 312 313 This behaves like the python string method of the same name. 314 315 Returns an integer, the index of the last (right most) occurrence of 316 substring argument sub in the (sub)sequence given by [start:end]. 317 318 Arguments: 319 - sub - a string or another Seq object to look for 320 - start - optional integer, slice start 321 - end - optional integer, slice end 322 323 Returns -1 if the subsequence is NOT found. 324 325 e.g. Locating the last typical start codon, AUG, in an RNA sequence: 326 327 >>> from Bio.Seq import Seq 328 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 329 >>> my_rna.rfind("AUG") 330 15 331 """ 332 #If it has one, check the alphabet: 333 sub_str = self._get_seq_str_and_check_alphabet(sub) 334 return str(self).rfind(sub_str, start, end)
335
336 - def startswith(self, prefix, start=0, end=sys.maxint) :
337 """Does the Seq start with the given prefix? Returns True/False. 338 339 This behaves like the python string method of the same name. 340 341 Return True if the sequence starts with the specified prefix 342 (a string or another Seq object), False otherwise. 343 With optional start, test sequence beginning at that position. 344 With optional end, stop comparing sequence at that position. 345 prefix can also be a tuple of strings to try. e.g. 346 347 >>> from Bio.Seq import Seq 348 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 349 >>> my_rna.startswith("GUC") 350 True 351 >>> my_rna.startswith("AUG") 352 False 353 >>> my_rna.startswith("AUG", 3) 354 True 355 >>> my_rna.startswith(("UCC","UCA","UCG"),1) 356 True 357 """ 358 #If it has one, check the alphabet: 359 if isinstance(prefix, tuple) : 360 #TODO - Once we drop support for Python 2.4, instead of this 361 #loop offload to the string method (requires Python 2.5+). 362 #Check all the alphabets first... 363 prefix_strings = [self._get_seq_str_and_check_alphabet(p) \ 364 for p in prefix] 365 for prefix_str in prefix_strings : 366 if str(self).startswith(prefix_str, start, end) : 367 return True 368 return False 369 else : 370 prefix_str = self._get_seq_str_and_check_alphabet(prefix) 371 return str(self).startswith(prefix_str, start, end)
372
373 - def endswith(self, suffix, start=0, end=sys.maxint) :
374 """Does the Seq end with the given suffix? Returns True/False. 375 376 This behaves like the python string method of the same name. 377 378 Return True if the sequence ends with the specified suffix 379 (a string or another Seq object), False otherwise. 380 With optional start, test sequence beginning at that position. 381 With optional end, stop comparing sequence at that position. 382 suffix can also be a tuple of strings to try. e.g. 383 384 >>> from Bio.Seq import Seq 385 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 386 >>> my_rna.endswith("UUG") 387 True 388 >>> my_rna.endswith("AUG") 389 False 390 >>> my_rna.endswith("AUG", 0, 18) 391 True 392 >>> my_rna.endswith(("UCC","UCA","UUG")) 393 True 394 """ 395 #If it has one, check the alphabet: 396 if isinstance(suffix, tuple) : 397 #TODO - Once we drop support for Python 2.4, instead of this 398 #loop offload to the string method (requires Python 2.5+). 399 #Check all the alphabets first... 400 suffix_strings = [self._get_seq_str_and_check_alphabet(p) \ 401 for p in suffix] 402 for suffix_str in suffix_strings : 403 if str(self).endswith(suffix_str, start, end) : 404 return True 405 return False 406 else : 407 suffix_str = self._get_seq_str_and_check_alphabet(suffix) 408 return str(self).endswith(suffix_str, start, end)
409 410
411 - def split(self, sep=None, maxsplit=-1) :
412 """Split method, like that of a python string. 413 414 This behaves like the python string method of the same name. 415 416 Return a list of the 'words' in the string (as Seq objects), 417 using sep as the delimiter string. If maxsplit is given, at 418 most maxsplit splits are done. If maxsplit is ommited, all 419 splits are made. 420 421 Following the python string method, sep will by default be any 422 white space (tabs, spaces, newlines) but this is unlikely to 423 apply to biological sequences. 424 425 e.g. 426 427 >>> from Bio.Seq import Seq 428 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 429 >>> my_aa = my_rna.translate() 430 >>> my_aa 431 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*')) 432 >>> my_aa.split("*") 433 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 434 >>> my_aa.split("*",1) 435 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 436 437 See also the rsplit method: 438 439 >>> my_aa.rsplit("*",1) 440 [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 441 """ 442 #If it has one, check the alphabet: 443 sep_str = self._get_seq_str_and_check_alphabet(sep) 444 #TODO - If the sep is the defined stop symbol, or gap char, 445 #should we adjust the alphabet? 446 return [Seq(part, self.alphabet) \ 447 for part in str(self).split(sep_str, maxsplit)]
448
449 - def rsplit(self, sep=None, maxsplit=-1) :
450 """Right split method, like that of a python string. 451 452 This behaves like the python string method of the same name. 453 454 Return a list of the 'words' in the string (as Seq objects), 455 using sep as the delimiter string. If maxsplit is given, at 456 most maxsplit splits are done COUNTING FROM THE RIGHT. 457 If maxsplit is ommited, all splits are made. 458 459 Following the python string method, sep will by default be any 460 white space (tabs, spaces, newlines) but this is unlikely to 461 apply to biological sequences. 462 463 e.g. print my_seq.rsplit("*",1) 464 465 See also the split method. 466 """ 467 #If it has one, check the alphabet: 468 sep_str = self._get_seq_str_and_check_alphabet(sep) 469 try : 470 return [Seq(part, self.alphabet) \ 471 for part in str(self).rsplit(sep_str, maxsplit)] 472 except AttributeError : 473 #Python 2.3 doesn't have a string rsplit method, which we can 474 #word around by reversing the sequence, using (left) split, 475 #and then reversing the answer. Not very efficient! 476 words = [Seq(word[::-1], self.alphabet) for word \ 477 in str(self)[::-1].split(sep_str[::-1], maxsplit)] 478 words.reverse() 479 return words
480
481 - def strip(self, chars=None) :
482 """Returns a new Seq object with leading and trailing ends stripped. 483 484 This behaves like the python string method of the same name. 485 486 Optional argument chars defines which characters to remove. If 487 ommitted or None (default) then as for the python string method, 488 this defaults to removing any white space. 489 490 e.g. print my_seq.strip("-") 491 492 See also the lstrip and rstrip methods. 493 """ 494 #If it has one, check the alphabet: 495 strip_str = self._get_seq_str_and_check_alphabet(chars) 496 return Seq(str(self).strip(strip_str), self.alphabet)
497
498 - def lstrip(self, chars=None) :
499 """Returns a new Seq object with leading (left) end stripped. 500 501 This behaves like the python string method of the same name. 502 503 Optional argument chars defines which characters to remove. If 504 ommitted or None (default) then as for the python string method, 505 this defaults to removing any white space. 506 507 e.g. print my_seq.lstrip("-") 508 509 See also the strip and rstrip methods. 510 """ 511 #If it has one, check the alphabet: 512 strip_str = self._get_seq_str_and_check_alphabet(chars) 513 return Seq(str(self).lstrip(strip_str), self.alphabet)
514
515 - def rstrip(self, chars=None) :
516 """Returns a new Seq object with trailing (right) end stripped. 517 518 This behaves like the python string method of the same name. 519 520 Optional argument chars defines which characters to remove. If 521 ommitted or None (default) then as for the python string method, 522 this defaults to removing any white space. 523 524 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail): 525 526 >>> from Bio.Alphabet import IUPAC 527 >>> from Bio.Seq import Seq 528 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna) 529 >>> my_seq 530 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA()) 531 >>> my_seq.rstrip("A") 532 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA()) 533 534 See also the strip and lstrip methods. 535 """ 536 #If it has one, check the alphabet: 537 strip_str = self._get_seq_str_and_check_alphabet(chars) 538 return Seq(str(self).rstrip(strip_str), self.alphabet)
539
540 - def complement(self):
541 """Returns the complement sequence. New Seq object. 542 543 >>> from Bio.Seq import Seq 544 >>> from Bio.Alphabet import IUPAC 545 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna) 546 >>> my_dna 547 Seq('CCCCCGATAG', IUPACUnambiguousDNA()) 548 >>> my_dna.complement() 549 Seq('GGGGGCTATC', IUPACUnambiguousDNA()) 550 551 You can of course used mixed case sequences, 552 553 >>> from Bio.Seq import Seq 554 >>> from Bio.Alphabet import generic_dna 555 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna) 556 >>> my_dna 557 Seq('CCCCCgatA-GD', DNAAlphabet()) 558 >>> my_dna.complement() 559 Seq('GGGGGctaT-CH', DNAAlphabet()) 560 561 Note in the above example, ambiguous character D denotes 562 G, A or T so its complement is H (for C, T or A). 563 564 Trying to complement a protein sequence raises an exception. 565 566 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 567 >>> my_protein.complement() 568 Traceback (most recent call last): 569 ... 570 ValueError: Proteins do not have complements! 571 """ 572 base = Alphabet._get_base_alphabet(self.alphabet) 573 if isinstance(base, Alphabet.ProteinAlphabet) : 574 raise ValueError("Proteins do not have complements!") 575 if isinstance(base, Alphabet.DNAAlphabet) : 576 ttable = _dna_complement_table 577 elif isinstance(base, Alphabet.RNAAlphabet) : 578 ttable = _rna_complement_table 579 elif ('U' in self._data or 'u' in self._data) \ 580 and ('T' in self._data or 't' in self._data): 581 #TODO - Handle this cleanly? 582 raise ValueError("Mixed RNA/DNA found") 583 elif 'U' in self._data or 'u' in self._data: 584 ttable = _rna_complement_table 585 else: 586 ttable = _dna_complement_table 587 #Much faster on really long sequences than the previous loop based one. 588 #thx to Michael Palmer, University of Waterloo 589 return Seq(str(self).translate(ttable), self.alphabet)
590
591 - def reverse_complement(self):
592 """Returns the reverse complement sequence. New Seq object. 593 594 >>> from Bio.Seq import Seq 595 >>> from Bio.Alphabet import IUPAC 596 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna) 597 >>> my_dna 598 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA()) 599 >>> my_dna.reverse_complement() 600 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA()) 601 602 Note in the above example, since R = G or A, its complement 603 is Y (which denotes C or T). 604 605 You can of course used mixed case sequences, 606 607 >>> from Bio.Seq import Seq 608 >>> from Bio.Alphabet import generic_dna 609 >>> my_dna = Seq("CCCCCgatA-G", generic_dna) 610 >>> my_dna 611 Seq('CCCCCgatA-G', DNAAlphabet()) 612 >>> my_dna.reverse_complement() 613 Seq('C-TatcGGGGG', DNAAlphabet()) 614 615 Trying to complement a protein sequence raises an exception: 616 617 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 618 >>> my_protein.reverse_complement() 619 Traceback (most recent call last): 620 ... 621 ValueError: Proteins do not have complements! 622 """ 623 #Use -1 stride/step to reverse the complement 624 return self.complement()[::-1]
625
626 - def transcribe(self):
627 """Returns the RNA sequence from a DNA sequence. New Seq object. 628 629 >>> from Bio.Seq import Seq 630 >>> from Bio.Alphabet import IUPAC 631 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", 632 ... IUPAC.unambiguous_dna) 633 >>> coding_dna 634 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 635 >>> coding_dna.transcribe() 636 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 637 638 Trying to transcribe a protein or RNA sequence raises an exception: 639 640 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 641 >>> my_protein.transcribe() 642 Traceback (most recent call last): 643 ... 644 ValueError: Proteins cannot be transcribed! 645 """ 646 base = Alphabet._get_base_alphabet(self.alphabet) 647 if isinstance(base, Alphabet.ProteinAlphabet) : 648 raise ValueError("Proteins cannot be transcribed!") 649 if isinstance(base, Alphabet.RNAAlphabet) : 650 raise ValueError("RNA cannot be transcribed!") 651 652 if self.alphabet==IUPAC.unambiguous_dna: 653 alphabet = IUPAC.unambiguous_rna 654 elif self.alphabet==IUPAC.ambiguous_dna: 655 alphabet = IUPAC.ambiguous_rna 656 else: 657 alphabet = Alphabet.generic_rna 658 return Seq(str(self).replace('T','U').replace('t','u'), alphabet)
659
660 - def back_transcribe(self):
661 """Returns the DNA sequence from an RNA sequence. New Seq object. 662 663 >>> from Bio.Seq import Seq 664 >>> from Bio.Alphabet import IUPAC 665 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", 666 ... IUPAC.unambiguous_rna) 667 >>> messenger_rna 668 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 669 >>> messenger_rna.back_transcribe() 670 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 671 672 Trying to back-transcribe a protein or DNA sequence raises an 673 exception: 674 675 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 676 >>> my_protein.back_transcribe() 677 Traceback (most recent call last): 678 ... 679 ValueError: Proteins cannot be back transcribed! 680 """ 681 base = Alphabet._get_base_alphabet(self.alphabet) 682 if isinstance(base, Alphabet.ProteinAlphabet) : 683 raise ValueError("Proteins cannot be back transcribed!") 684 if isinstance(base, Alphabet.DNAAlphabet) : 685 raise ValueError("DNA cannot be back transcribed!") 686 687 if self.alphabet==IUPAC.unambiguous_rna: 688 alphabet = IUPAC.unambiguous_dna 689 elif self.alphabet==IUPAC.ambiguous_rna: 690 alphabet = IUPAC.ambiguous_dna 691 else: 692 alphabet = Alphabet.generic_dna 693 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
694
695 - def translate(self, table="Standard", stop_symbol="*", to_stop=False):
696 """Turns a nucleotide sequence into a protein sequence. New Seq object. 697 698 This method will translate DNA or RNA sequences, and those with a 699 nucleotide or generic alphabet. Trying to translate a protein 700 sequence raises an exception. 701 702 Arguments: 703 - table - Which codon table to use? This can be either a name 704 (string) or an NCBI identifier (integer). This defaults 705 to the "Standard" table. 706 - stop_symbol - Single character string, what to use for terminators. 707 This defaults to the asterisk, "*". 708 - to_stop - Boolean, defaults to False meaning do a full translation 709 continuing on past any stop codons (translated as the 710 specified stop_symbol). If True, translation is 711 terminated at the first in frame stop codon (and the 712 stop_symbol is not appended to the returned protein 713 sequence). 714 715 e.g. Using the standard table: 716 717 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") 718 >>> coding_dna.translate() 719 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 720 >>> coding_dna.translate(stop_symbol="@") 721 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@')) 722 >>> coding_dna.translate(to_stop=True) 723 Seq('VAIVMGR', ExtendedIUPACProtein()) 724 725 Now using NCBI table 2, where TGA is not a stop codon: 726 727 >>> coding_dna.translate(table=2) 728 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 729 >>> coding_dna.translate(table=2, to_stop=True) 730 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein()) 731 732 If the sequence has no in-frame stop codon, then the to_stop argument 733 has no effect: 734 735 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC") 736 >>> coding_dna2.translate() 737 Seq('LAIVMGR', ExtendedIUPACProtein()) 738 >>> coding_dna2.translate(to_stop=True) 739 Seq('LAIVMGR', ExtendedIUPACProtein()) 740 741 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 742 or a stop codon. These are translated as "X". Any invalid codon 743 (e.g. "TA?" or "T-A") will throw a TranslationError. 744 745 NOTE - Does NOT support gapped sequences. 746 747 NOTE - This does NOT behave like the python string's translate 748 method. For that use str(my_seq).translate(...) instead. 749 """ 750 try: 751 table_id = int(table) 752 except ValueError: 753 table_id = None 754 if isinstance(table, str) and len(table)==256 : 755 raise ValueError("The Seq object translate method DOES NOT take " \ 756 + "a 256 character string mapping table like " \ 757 + "the python string object's translate method. " \ 758 + "Use str(my_seq).translate(...) instead.") 759 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 760 Alphabet.ProteinAlphabet) : 761 raise ValueError("Proteins cannot be translated!") 762 if self.alphabet==IUPAC.unambiguous_dna: 763 #Will use standard IUPAC protein alphabet, no need for X 764 if table_id is None: 765 codon_table = CodonTable.unambiguous_dna_by_name[table] 766 else: 767 codon_table = CodonTable.unambiguous_dna_by_id[table_id] 768 #elif self.alphabet==IUPAC.ambiguous_dna: 769 # if table_id is None: 770 # codon_table = CodonTable.ambiguous_dna_by_name[table] 771 # else: 772 # codon_table = CodonTable.ambiguous_dna_by_id[table_id] 773 elif self.alphabet==IUPAC.unambiguous_rna: 774 #Will use standard IUPAC protein alphabet, no need for X 775 if table_id is None: 776 codon_table = CodonTable.unambiguous_rna_by_name[table] 777 else: 778 codon_table = CodonTable.unambiguous_rna_by_id[table_id] 779 #elif self.alphabet==IUPAC.ambiguous_rna: 780 # if table_id is None: 781 # codon_table = CodonTable.ambiguous_rna_by_name[table] 782 # else: 783 # codon_table = CodonTable.ambiguous_rna_by_id[table_id] 784 else: 785 #This will use the extend IUPAC protein alphabet with X etc. 786 #The same table can be used for RNA or DNA (we use this for 787 #translating strings). 788 if table_id is None: 789 codon_table = CodonTable.ambiguous_generic_by_name[table] 790 else: 791 codon_table = CodonTable.ambiguous_generic_by_id[table_id] 792 protein = _translate_str(str(self), codon_table, stop_symbol, to_stop) 793 if stop_symbol in protein : 794 alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet, 795 stop_symbol = stop_symbol) 796 else : 797 alphabet = codon_table.protein_alphabet 798 return Seq(protein, alphabet)
799
800 -class UnknownSeq(Seq):
801 """A read-only sequence object of known length but unknown contents. 802 803 If you have an unknown sequence, you can represent this with a normal 804 Seq object, for example: 805 806 >>> my_seq = Seq("N"*5) 807 >>> my_seq 808 Seq('NNNNN', Alphabet()) 809 >>> len(my_seq) 810 5 811 >>> print my_seq 812 NNNNN 813 814 However, this is rather wasteful of memory (especially for large 815 sequences), which is where this class is most usefull: 816 817 >>> unk_five = UnknownSeq(5) 818 >>> unk_five 819 UnknownSeq(5, alphabet = Alphabet(), character = '?') 820 >>> len(unk_five) 821 5 822 >>> print(unk_five) 823 ????? 824 825 You can add unknown sequence together, provided their alphabets and 826 characters are compatible, and get another memory saving UnknownSeq: 827 828 >>> unk_four = UnknownSeq(4) 829 >>> unk_four 830 UnknownSeq(4, alphabet = Alphabet(), character = '?') 831 >>> unk_four + unk_five 832 UnknownSeq(9, alphabet = Alphabet(), character = '?') 833 834 If the alphabet or characters don't match up, the addition gives an 835 ordinary Seq object: 836 837 >>> unk_nnnn = UnknownSeq(4, character = "N") 838 >>> unk_nnnn 839 UnknownSeq(4, alphabet = Alphabet(), character = 'N') 840 >>> unk_nnnn + unk_four 841 Seq('NNNN????', Alphabet()) 842 843 Combining with a real Seq gives a new Seq object: 844 845 >>> known_seq = Seq("ACGT") 846 >>> unk_four + known_seq 847 Seq('????ACGT', Alphabet()) 848 >>> known_seq + unk_four 849 Seq('ACGT????', Alphabet()) 850 """
851 - def __init__(self, length, alphabet = Alphabet.generic_alphabet, character = None) :
852 """Create a new UnknownSeq object. 853 854 If character is ommited, it is determed from the alphabet, "N" for 855 nucleotides, "X" for proteins, and "?" otherwise. 856 """ 857 self._length = int(length) 858 if self._length < 0 : 859 #TODO - Block zero length UnknownSeq? You can just use a Seq! 860 raise ValueError("Length must not be negative.") 861 self.alphabet = alphabet 862 if character : 863 if len(character) != 1 : 864 raise ValueError("character argument should be a single letter string.") 865 self._character = character 866 else : 867 base = Alphabet._get_base_alphabet(alphabet) 868 #TODO? Check the case of the letters in the alphabet? 869 #We may have to use "n" instead of "N" etc. 870 if isinstance(base, Alphabet.NucleotideAlphabet) : 871 self._character = "N" 872 elif isinstance(base, Alphabet.ProteinAlphabet) : 873 self._character = "X" 874 else : 875 self._character = "?"
876
877 - def __len__(self) :
878 """Returns the stated length of the unknown sequence.""" 879 return self._length
880
881 - def __str__(self) :
882 """Returns the unknown sequence as full string of the given length.""" 883 return self._character * self._length
884
885 - def __repr__(self):
886 return "UnknownSeq(%i, alphabet = %s, character = %s)" \ 887 % (self._length, repr(self.alphabet), repr(self._character))
888
889 - def __add__(self, other) :
890 if isinstance(other, UnknownSeq) \ 891 and other._character == self._character : 892 #TODO - Check the alphabets match 893 return UnknownSeq(len(self)+len(other), 894 self.alphabet, self._character) 895 #Offload to the base class... 896 return Seq(str(self), self.alphabet) + other
897
898 - def __radd__(self, other) :
899 if isinstance(other, UnknownSeq) \ 900 and other._character == self._character : 901 #TODO - Check the alphabets match 902 return UnknownSeq(len(self)+len(other), 903 self.alphabet, self._character) 904 #Offload to the base class... 905 return other + Seq(str(self), self.alphabet)
906
907 - def __getitem__(self, index):
908 if isinstance(index, int) : 909 #TODO - Check the bounds without wasting memory 910 return str(self)[index] 911 else : 912 #TODO - Work out the length without wasting memory 913 return UnknownSeq(len(("#"*self._length)[index]), 914 self.alphabet, self._character)
915
916 - def count(self, sub, start=0, end=sys.maxint):
917 """Non-overlapping count method, like that of a python string. 918 919 This behaves like the python string (and Seq object) method of the 920 same name, which does a non-overlapping count! 921 922 Returns an integer, the number of occurrences of substring 923 argument sub in the (sub)sequence given by [start:end]. 924 Optional arguments start and end are interpreted as in slice 925 notation. 926 927 Arguments: 928 - sub - a string or another Seq object to look for 929 - start - optional integer, slice start 930 - end - optional integer, slice end 931 932 >>> "NNNN".count("N") 933 4 934 >>> Seq("NNNN").count("N") 935 4 936 >>> UnknownSeq(4, character="N").count("N") 937 4 938 >>> UnknownSeq(4, character="N").count("A") 939 0 940 >>> UnknownSeq(4, character="N").count("AA") 941 0 942 943 HOWEVER, please note because that python strings and Seq objects (and 944 MutableSeq objects) do a non-overlapping search, this may not give 945 the answer you expect: 946 947 >>> UnknownSeq(4, character="N").count("NN") 948 2 949 >>> UnknownSeq(4, character="N").count("NNN") 950 1 951 """ 952 sub_str = self._get_seq_str_and_check_alphabet(sub) 953 if len(sub_str) == 1 : 954 if str(sub_str) == self._character : 955 if start==0 and end >= self._length : 956 return self._length 957 else : 958 #This could be done more cleverly... 959 return str(self).count(sub_str, start, end) 960 else : 961 return 0 962 else : 963 if set(sub_str) == set(self._character) : 964 if start==0 and end >= self._length : 965 return self._length // len(sub_str) 966 else : 967 #This could be done more cleverly... 968 return str(self).count(sub_str, start, end) 969 else : 970 return 0
971
972 - def complement(self) :
973 """The complement of an unknown nucleotide equals itself. 974 975 >>> my_nuc = UnknownSeq(8) 976 >>> my_nuc 977 UnknownSeq(8, alphabet = Alphabet(), character = '?') 978 >>> print my_nuc 979 ???????? 980 >>> my_nuc.complement() 981 UnknownSeq(8, alphabet = Alphabet(), character = '?') 982 >>> print my_nuc.complement() 983 ???????? 984 """ 985 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 986 Alphabet.ProteinAlphabet) : 987 raise ValueError("Proteins do not have complements!") 988 return self
989
990 - def reverse_complement(self) :
991 """The reverse complement of an unknown nucleotide equals itself. 992 993 >>> my_nuc = UnknownSeq(10) 994 >>> my_nuc 995 UnknownSeq(10, alphabet = Alphabet(), character = '?') 996 >>> print my_nuc 997 ?????????? 998 >>> my_nuc.reverse_complement() 999 UnknownSeq(10, alphabet = Alphabet(), character = '?') 1000 >>> print my_nuc.reverse_complement() 1001 ?????????? 1002 """ 1003 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1004 Alphabet.ProteinAlphabet) : 1005 raise ValueError("Proteins do not have complements!") 1006 return self
1007
1008 - def transcribe(self) :
1009 """Returns unknown RNA sequence from an unknown DNA sequence. 1010 1011 >>> my_dna = UnknownSeq(10, character="N") 1012 >>> my_dna 1013 UnknownSeq(10, alphabet = Alphabet(), character = 'N') 1014 >>> print my_dna 1015 NNNNNNNNNN 1016 >>> my_rna = my_dna.transcribe() 1017 >>> my_rna 1018 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N') 1019 >>> print my_rna 1020 NNNNNNNNNN 1021 """ 1022 #Offload the alphabet stuff 1023 s = Seq(self._character, self.alphabet).transcribe() 1024 return UnknownSeq(self._length, s.alphabet, self._character)
1025
1026 - def back_transcribe(self) :
1027 """Returns unknown DNA sequence from an unknown RNA sequence. 1028 1029 >>> my_rna = UnknownSeq(20, character="N") 1030 >>> my_rna 1031 UnknownSeq(20, alphabet = Alphabet(), character = 'N') 1032 >>> print my_rna 1033 NNNNNNNNNNNNNNNNNNNN 1034 >>> my_dna = my_rna.back_transcribe() 1035 >>> my_dna 1036 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N') 1037 >>> print my_dna 1038 NNNNNNNNNNNNNNNNNNNN 1039 """ 1040 #Offload the alphabet stuff 1041 s = Seq(self._character, self.alphabet).back_transcribe() 1042 return UnknownSeq(self._length, s.alphabet, self._character)
1043
1044 - def translate(self, **kwargs) :
1045 """Translate an unknown nucleotide sequence into an unknown protein. 1046 1047 e.g. 1048 1049 >>> my_seq = UnknownSeq(11, character="N") 1050 >>> print my_seq 1051 NNNNNNNNNNN 1052 >>> my_protein = my_seq.translate() 1053 >>> my_protein 1054 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X') 1055 >>> print my_protein 1056 XXX 1057 1058 In comparison, using a normal Seq object: 1059 1060 >>> my_seq = Seq("NNNNNNNNNNN") 1061 >>> print my_seq 1062 NNNNNNNNNNN 1063 >>> my_protein = my_seq.translate() 1064 >>> my_protein 1065 Seq('XXX', ExtendedIUPACProtein()) 1066 >>> print my_protein 1067 XXX 1068 1069 """ 1070 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1071 Alphabet.ProteinAlphabet) : 1072 raise ValueError("Proteins cannot be translated!") 1073 return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
1074 1075
1076 -class MutableSeq(object):
1077 """An editable sequence object (with an alphabet). 1078 1079 Unlike normal python strings and our basic sequence object (the Seq class) 1080 which are immuatable, the MutableSeq lets you edit the sequence in place. 1081 However, this means you cannot use a MutableSeq object as a dictionary key. 1082 1083 >>> from Bio.Seq import MutableSeq 1084 >>> from Bio.Alphabet import generic_dna 1085 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna) 1086 >>> my_seq 1087 MutableSeq('ACTCGTCGTCG', DNAAlphabet()) 1088 >>> my_seq[5] 1089 'T' 1090 >>> my_seq[5] = "A" 1091 >>> my_seq 1092 MutableSeq('ACTCGACGTCG', DNAAlphabet()) 1093 >>> my_seq[5] 1094 'A' 1095 >>> my_seq[5:8] = "NNN" 1096 >>> my_seq 1097 MutableSeq('ACTCGNNNTCG', DNAAlphabet()) 1098 >>> len(my_seq) 1099 11 1100 1101 Note that the MutableSeq object does not support as many string-like 1102 or biological methods as the Seq object. 1103 """
1104 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
1105 if type(data) == type(""): 1106 self.data = array.array("c", data) 1107 else: 1108 self.data = data # assumes the input is an array 1109 self.alphabet = alphabet
1110
1111 - def __repr__(self):
1112 """Returns a (truncated) representation of the sequence for debugging.""" 1113 if len(self) > 60 : 1114 #Shows the last three letters as it is often useful to see if there 1115 #is a stop codon at the end of a sequence. 1116 #Note total length is 54+3+3=60 1117 return "%s('%s...%s', %s)" % (self.__class__.__name__, 1118 str(self[:54]), str(self[-3:]), 1119 repr(self.alphabet)) 1120 else : 1121 return "%s('%s', %s)" % (self.__class__.__name__, 1122 str(self), 1123 repr(self.alphabet))
1124
1125 - def __str__(self):
1126 """Returns the full sequence as a python string. 1127 1128 Note that Biopython 1.44 and earlier would give a truncated 1129 version of repr(my_seq) for str(my_seq). If you are writing code 1130 which needs to be backwards compatible with old Biopython, you 1131 should continue to use my_seq.tostring() rather than str(my_seq). 1132 """ 1133 #See test_GAQueens.py for an historic usage of a non-string alphabet! 1134 return "".join(self.data)
1135
1136 - def __cmp__(self, other):
1137 """Compare the sequence for to another sequence or a string. 1138 1139 If compared to another sequence the alphabets must be compatible. 1140 Comparing DNA to RNA, or Nucleotide to Protein will raise an 1141 exception. 1142 1143 Otherwise only the sequence itself is compared, not the precise 1144 alphabet. 1145 1146 This method indirectly supports ==, < , etc.""" 1147 if hasattr(other, "alphabet") : 1148 #other should be a Seq or a MutableSeq 1149 if not Alphabet._check_type_compatible([self.alphabet, 1150 other.alphabet]) : 1151 raise TypeError("Incompatable alphabets %s and %s" \ 1152 % (repr(self.alphabet), repr(other.alphabet))) 1153 #They should be the same sequence type (or one of them is generic) 1154 if isinstance(other, MutableSeq): 1155 #See test_GAQueens.py for an historic usage of a non-string 1156 #alphabet! Comparing the arrays supports this. 1157 return cmp(self.data, other.data) 1158 else : 1159 return cmp(str(self), str(other)) 1160 elif isinstance(other, basestring) : 1161 return cmp(str(self), other) 1162 else : 1163 raise TypeError
1164
1165 - def __len__(self): return len(self.data)
1166
1167 - def __getitem__(self, index) :
1168 #Note since Python 2.0, __getslice__ is deprecated 1169 #and __getitem__ is used instead. 1170 #See http://docs.python.org/ref/sequence-methods.html 1171 if isinstance(index, int) : 1172 #Return a single letter as a string 1173 return self.data[index] 1174 else : 1175 #Return the (sub)sequence as another Seq object 1176 return MutableSeq(self.data[index], self.alphabet)
1177
1178 - def __setitem__(self, index, value):
1179 #Note since Python 2.0, __setslice__ is deprecated 1180 #and __setitem__ is used instead. 1181 #See http://docs.python.org/ref/sequence-methods.html 1182 if isinstance(index, int) : 1183 #Replacing a single letter with a new string 1184 self.data[index] = value 1185 else : 1186 #Replacing a sub-sequence 1187 if isinstance(value, MutableSeq): 1188 self.data[index] = value.data 1189 elif isinstance(value, type(self.data)): 1190 self.data[index] = value 1191 else: 1192 self.data[index] = array.array("c", str(value))
1193
1194 - def __delitem__(self, index):
1195 #Note since Python 2.0, __delslice__ is deprecated 1196 #and __delitem__ is used instead. 1197 #See http://docs.python.org/ref/sequence-methods.html 1198 1199 #Could be deleting a single letter, or a slice 1200 del self.data[index]
1201
1202 - def __add__(self, other):
1203 """Add another sequence or string to this sequence. 1204 1205 Returns a new MutableSeq object.""" 1206 if hasattr(other, "alphabet") : 1207 #other should be a Seq or a MutableSeq 1208 if not Alphabet._check_type_compatible([self.alphabet, 1209 other.alphabet]) : 1210 raise TypeError("Incompatable alphabets %s and %s" \ 1211 % (repr(self.alphabet), repr(other.alphabet))) 1212 #They should be the same sequence type (or one of them is generic) 1213 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1214 if isinstance(other, MutableSeq): 1215 #See test_GAQueens.py for an historic usage of a non-string 1216 #alphabet! Adding the arrays should support this. 1217 return self.__class__(self.data + other.data, a) 1218 else : 1219 return self.__class__(str(self) + str(other), a) 1220 elif isinstance(other, basestring) : 1221 #other is a plain string - use the current alphabet 1222 return self.__class__(str(self) + str(other), self.alphabet) 1223 else : 1224 raise TypeError
1225
1226 - def __radd__(self, other):
1227 if hasattr(other, "alphabet") : 1228 #other should be a Seq or a MutableSeq 1229 if not Alphabet._check_type_compatible([self.alphabet, 1230 other.alphabet]) : 1231 raise TypeError("Incompatable alphabets %s and %s" \ 1232 % (repr(self.alphabet), repr(other.alphabet))) 1233 #They should be the same sequence type (or one of them is generic) 1234 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 1235 if isinstance(other, MutableSeq): 1236 #See test_GAQueens.py for an historic usage of a non-string 1237 #alphabet! Adding the arrays should support this. 1238 return self.__class__(other.data + self.data, a) 1239 else : 1240 return self.__class__(str(other) + str(self), a) 1241 elif isinstance(other, basestring) : 1242 #other is a plain string - use the current alphabet 1243 return self.__class__(str(other) + str(self), self.alphabet) 1244 else : 1245 raise TypeError
1246
1247 - def append(self, c):
1248 self.data.append(c)
1249
1250 - def insert(self, i, c):
1251 self.data.insert(i, c)
1252
1253 - def pop(self, i = (-1)):
1254 c = self.data[i] 1255 del self.data[i] 1256 return c
1257
1258 - def remove(self, item):
1259 for i in range(len(self.data)): 1260 if self.data[i] == item: 1261 del self.data[i] 1262 return 1263 raise ValueError("MutableSeq.remove(x): x not in list")
1264
1265 - def count(self, sub, start=0, end=sys.maxint):
1266 """Non-overlapping count method, like that of a python string. 1267 1268 This behaves like the python string method of the same name, 1269 which does a non-overlapping count! 1270 1271 Returns an integer, the number of occurrences of substring 1272 argument sub in the (sub)sequence given by [start:end]. 1273 Optional arguments start and end are interpreted as in slice 1274 notation. 1275 1276 Arguments: 1277 - sub - a string or another Seq object to look for 1278 - start - optional integer, slice start 1279 - end - optional integer, slice end 1280 1281 e.g. 1282 1283 >>> from Bio.Seq import MutableSeq 1284 >>> my_mseq = MutableSeq("AAAATGA") 1285 >>> print my_mseq.count("A") 1286 5 1287 >>> print my_mseq.count("ATG") 1288 1 1289 >>> print my_mseq.count(Seq("AT")) 1290 1 1291 >>> print my_mseq.count("AT", 2, -1) 1292 1 1293 1294 HOWEVER, please note because that python strings, Seq objects and 1295 MutableSeq objects do a non-overlapping search, this may not give 1296 the answer you expect: 1297 1298 >>> "AAAA".count("AA") 1299 2 1300 >>> print MutableSeq("AAAA").count("AA") 1301 2 1302 1303 A non-overlapping search would give the answer as three! 1304 """ 1305 try : 1306 #TODO - Should we check the alphabet? 1307 search = sub.tostring() 1308 except AttributeError : 1309 search = sub 1310 1311 if not isinstance(search, basestring) : 1312 raise TypeError("expected a string, Seq or MutableSeq") 1313 1314 if len(search) == 1 : 1315 #Try and be efficient and work directly from the array. 1316 count = 0 1317 for c in self.data[start:end]: 1318 if c == search: count += 1 1319 return count 1320 else : 1321 #TODO - Can we do this more efficiently? 1322 return self.tostring().count(search, start, end)
1323
1324 - def index(self, item):
1325 for i in range(len(self.data)): 1326 if self.data[i] == item: 1327 return i 1328 raise ValueError("MutableSeq.index(x): x not in list")
1329
1330 - def reverse(self):
1331 """Modify the mutable sequence to reverse itself. 1332 1333 No return value. 1334 """ 1335 self.data.reverse()
1336
1337 - def complement(self):
1338 """Modify the mutable sequence to take on its complement. 1339 1340 Trying to complement a protein sequence raises an exception. 1341 1342 No return value. 1343 """ 1344 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 1345 Alphabet.ProteinAlphabet) : 1346 raise ValueError("Proteins do not have complements!") 1347 if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna): 1348 d = ambiguous_dna_complement 1349 elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna): 1350 d = ambiguous_rna_complement 1351 elif 'U' in self.data and 'T' in self.data : 1352 #TODO - Handle this cleanly? 1353 raise ValueError("Mixed RNA/DNA found") 1354 elif 'U' in self.data: 1355 d = ambiguous_rna_complement 1356 else: 1357 d = ambiguous_dna_complement 1358 c = dict([(x.lower(), y.lower()) for x,y in d.iteritems()]) 1359 d.update(c) 1360 self.data = map(lambda c: d[c], self.data) 1361 self.data = array.array('c', self.data)
1362
1363 - def reverse_complement(self):
1364 """Modify the mutable sequence to take on its reverse complement. 1365 1366 Trying to reverse complement a protein sequence raises an exception. 1367 1368 No return value. 1369 """ 1370 self.complement() 1371 self.data.reverse()
1372 1373 ## Sorting a sequence makes no sense. 1374 # def sort(self, *args): self.data.sort(*args) 1375
1376 - def extend(self, other):
1377 if isinstance(other, MutableSeq): 1378 for c in other.data: 1379 self.data.append(c) 1380 else: 1381 for c in other: 1382 self.data.append(c)
1383
1384 - def tostring(self):
1385 """Returns the full sequence as a python string. 1386 1387 Although not formally deprecated, you are now encouraged to use 1388 str(my_seq) instead of my_seq.tostring(). 1389 1390 Because str(my_seq) will give you the full sequence as a python string, 1391 there is often no need to make an explicit conversion. For example, 1392 1393 print "ID={%s}, sequence={%s}" % (my_name, my_seq) 1394 1395 On Biopython 1.44 or older you would have to have done this: 1396 1397 print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring()) 1398 """ 1399 return "".join(self.data)
1400
1401 - def toseq(self):
1402 """Returns the full sequence as a new immutable Seq object. 1403 1404 >>> from Bio.Seq import Seq 1405 >>> from Bio.Alphabet import IUPAC 1406 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", \ 1407 IUPAC.protein) 1408 >>> my_mseq 1409 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1410 >>> my_mseq.toseq() 1411 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 1412 1413 Note that the alphabet is preserved. 1414 """ 1415 return Seq("".join(self.data), self.alphabet)
1416 1417 # The transcribe, backward_transcribe, and translate functions are 1418 # user-friendly versions of the corresponding functions in Bio.Transcribe 1419 # and Bio.Translate. The functions work both on Seq objects, and on strings. 1420
1421 -def transcribe(dna):
1422 """Transcribes a DNA sequence into RNA. 1423 1424 If given a string, returns a new string object. 1425 1426 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1427 1428 Trying to transcribe a protein or RNA sequence raises an exception. 1429 1430 e.g. 1431 1432 >>> transcribe("ACTGN") 1433 'ACUGN' 1434 """ 1435 if isinstance(dna, Seq) : 1436 return dna.transcribe() 1437 elif isinstance(dna, MutableSeq): 1438 return dna.toseq().transcribe() 1439 else: 1440 return dna.replace('T','U').replace('t','u')
1441
1442 -def back_transcribe(rna):
1443 """Back-transcribes an RNA sequence into DNA. 1444 1445 If given a string, returns a new string object. 1446 1447 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1448 1449 Trying to transcribe a protein or DNA sequence raises an exception. 1450 1451 e.g. 1452 1453 >>> back_transcribe("ACUGN") 1454 'ACTGN' 1455 """ 1456 if isinstance(rna, Seq) : 1457 return rna.back_transcribe() 1458 elif isinstance(rna, MutableSeq): 1459 return rna.toseq().back_transcribe() 1460 else: 1461 return rna.replace('U','T').replace('u','t')
1462
1463 -def _translate_str(sequence, table, stop_symbol="*", 1464 to_stop=False, pos_stop="X") :
1465 """Helper function to translate a nucleotide string (PRIVATE). 1466 1467 Arguments: 1468 - sequence - a string 1469 - table - a CodonTable object (NOT a table name or id number) 1470 - stop_symbol - a single character string, what to use for terminators. 1471 - to_stop - boolean, should translation terminate at the first 1472 in frame stop codon? If there is no in-frame stop codon 1473 then translation continues to the end. 1474 - pos_stop - a single character string for a possible stop codon 1475 (e.g. TAN or NNN) 1476 1477 Returns a string. 1478 1479 e.g. 1480 1481 >>> from Bio.Data import CodonTable 1482 >>> table = CodonTable.ambiguous_dna_by_id[1] 1483 >>> _translate_str("AAA", table) 1484 'K' 1485 >>> _translate_str("TAR", table) 1486 '*' 1487 >>> _translate_str("TAN", table) 1488 'X' 1489 >>> _translate_str("TAN", table, pos_stop="@") 1490 '@' 1491 >>> _translate_str("TA?", table) 1492 Traceback (most recent call last): 1493 ... 1494 TranslationError: Codon 'TA?' is invalid 1495 """ 1496 sequence = sequence.upper() 1497 amino_acids = [] 1498 forward_table = table.forward_table 1499 stop_codons = table.stop_codons 1500 if table.nucleotide_alphabet.letters is not None : 1501 valid_letters = set(table.nucleotide_alphabet.letters.upper()) 1502 else : 1503 #Assume the worst case, ambiguous DNA or RNA: 1504 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + \ 1505 IUPAC.ambiguous_rna.letters.upper()) 1506 1507 n = len(sequence) 1508 for i in xrange(0,n-n%3,3) : 1509 codon = sequence[i:i+3] 1510 try : 1511 amino_acids.append(forward_table[codon]) 1512 except (KeyError, CodonTable.TranslationError) : 1513 #Todo? Treat "---" as a special case (gapped translation) 1514 if codon in table.stop_codons : 1515 if to_stop : break 1516 amino_acids.append(stop_symbol) 1517 elif valid_letters.issuperset(set(codon)) : 1518 #Possible stop codon (e.g. NNN or TAN) 1519 amino_acids.append(pos_stop) 1520 else : 1521 raise CodonTable.TranslationError(\ 1522 "Codon '%s' is invalid" % codon) 1523 return "".join(amino_acids)
1524
1525 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False):
1526 """Translate a nucleotide sequence into amino acids. 1527 1528 If given a string, returns a new string object. Given a Seq or 1529 MutableSeq, returns a Seq object with a protein alphabet. 1530 1531 Arguments: 1532 - table - Which codon table to use? This can be either a name 1533 (string) or an NCBI identifier (integer). Defaults 1534 to the "Standard" table. 1535 - stop_symbol - Single character string, what to use for any 1536 terminators, defaults to the asterisk, "*". 1537 - to_stop - Boolean, defaults to False meaning do a full 1538 translation continuing on past any stop codons 1539 (translated as the specified stop_symbol). If 1540 True, translation is terminated at the first in 1541 frame stop codon (and the stop_symbol is not 1542 appended to the returned protein sequence). 1543 1544 A simple string example using the default (standard) genetic code: 1545 1546 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG" 1547 >>> translate(coding_dna) 1548 'VAIVMGR*KGAR*' 1549 >>> translate(coding_dna, stop_symbol="@") 1550 'VAIVMGR@KGAR@' 1551 >>> translate(coding_dna, to_stop=True) 1552 'VAIVMGR' 1553 1554 Now using NCBI table 2, where TGA is not a stop codon: 1555 1556 >>> translate(coding_dna, table=2) 1557 'VAIVMGRWKGAR*' 1558 >>> translate(coding_dna, table=2, to_stop=True) 1559 'VAIVMGRWKGAR' 1560 1561 Note that if the sequence has no in-frame stop codon, then the to_stop 1562 argument has no effect: 1563 1564 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC" 1565 >>> translate(coding_dna2) 1566 'VAIVMGR' 1567 >>> translate(coding_dna2, to_stop=True) 1568 'VAIVMGR' 1569 1570 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 1571 or a stop codon. These are translated as "X". Any invalid codon 1572 (e.g. "TA?" or "T-A") will throw a TranslationError. 1573 1574 NOTE - Does NOT support gapped sequences. 1575 1576 It will however translate either DNA or RNA. 1577 """ 1578 if isinstance(sequence, Seq) : 1579 return sequence.translate(table, stop_symbol, to_stop) 1580 elif isinstance(sequence, MutableSeq): 1581 #Return a Seq object 1582 return sequence.toseq().translate(table, stop_symbol, to_stop) 1583 else: 1584 #Assume its a string, return a string 1585 try : 1586 codon_table = CodonTable.ambiguous_generic_by_id[int(table)] 1587 except ValueError : 1588 codon_table = CodonTable.ambiguous_generic_by_name[table] 1589 return _translate_str(sequence, codon_table, stop_symbol, to_stop)
1590
1591 -def reverse_complement(sequence):
1592 """Returns the reverse complement sequence of a nucleotide string. 1593 1594 If given a string, returns a new string object. 1595 Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet. 1596 1597 Supports unambiguous and ambiguous nucleotide sequences. 1598 1599 e.g. 1600 1601 >>> reverse_complement("ACTG-NH") 1602 'DN-CAGT' 1603 """ 1604 if isinstance(sequence, Seq) : 1605 #Return a Seq 1606 return sequence.reverse_complement() 1607 elif isinstance(sequence, MutableSeq) : 1608 #Return a Seq 1609 #Don't use the MutableSeq reverse_complement method as it is 'in place'. 1610 return sequence.toseq().reverse_complement() 1611 1612 #Assume its a string. 1613 #In order to avoid some code duplication, the old code would turn the string 1614 #into a Seq, use the reverse_complement method, and convert back to a string. 1615 #This worked, but is over five times slower on short sequences! 1616 if ('U' in sequence or 'u' in sequence) \ 1617 and ('T' in sequence or 't' in sequence): 1618 raise ValueError("Mixed RNA/DNA found") 1619 elif 'U' in sequence or 'u' in sequence: 1620 ttable = _rna_complement_table 1621 else: 1622 ttable = _dna_complement_table 1623 return sequence.translate(ttable)[::-1]
1624
1625 -def _test():
1626 """Run the Bio.Seq module's doctests.""" 1627 print "Runing doctests..." 1628 import doctest 1629 doctest.testmod() 1630 print "Done"
1631 1632 if __name__ == "__main__": 1633 _test() 1634