1
2
3
4
5
6
7
8 """Provides objects to represent biological sequences with alphabets.
9
10 See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial:
11 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
12 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
13 """
14 __docformat__ ="epytext en"
15
16 import string
17 import array
18 import sys
19
20 import Alphabet
21 from Alphabet import IUPAC
22 from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement
23 from Bio.Data import CodonTable
24
26 """Makes a python string translation table (PRIVATE).
27
28 Arguments:
29 - complement_mapping - a dictionary such as ambiguous_dna_complement
30 and ambiguous_rna_complement from Data.IUPACData.
31
32 Returns a translation table (a string of length 256) for use with the
33 python string's translate method to use in a (reverse) complement.
34
35 Compatible with lower case and upper case sequences.
36
37 For internal use only.
38 """
39 before = ''.join(complement_mapping.keys())
40 after = ''.join(complement_mapping.values())
41 before = before + before.lower()
42 after = after + after.lower()
43 return string.maketrans(before, after)
44
45 _dna_complement_table = _maketrans(ambiguous_dna_complement)
46 _rna_complement_table = _maketrans(ambiguous_rna_complement)
47
49 """A read-only sequence object (essentially a string with an alphabet).
50
51 Like normal python strings, our basic sequence object is immutable.
52 This prevents you from doing my_seq[5] = "A" for example, but does allow
53 Seq objects to be used as dictionary keys.
54
55 The Seq object provides a number of string like methods (such as count,
56 find, split and strip), which are alphabet aware where appropriate.
57
58 The Seq object also provides some biological methods, such as complement,
59 reverse_complement, transcribe, back_transcribe and translate (which are
60 not applicable to sequences with a protein alphabet).
61 """
63 """Create a Seq object.
64
65 Arguments:
66 - seq - Sequence, required (string)
67 - alphabet - Optional argument, an Alphabet object from Bio.Alphabet
68
69 You will typically use Bio.SeqIO to read in sequences from files as
70 SeqRecord objects, whose sequence will be exposed as a Seq object via
71 the seq property.
72
73 However, will often want to create your own Seq objects directly:
74
75 >>> from Bio.Seq import Seq
76 >>> from Bio.Alphabet import IUPAC
77 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
78 ... IUPAC.protein)
79 >>> my_seq
80 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
81 >>> print my_seq
82 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
83 """
84
85 assert (type(data) == type("") or
86 type(data) == type(u""))
87 self._data = data
88 self.alphabet = alphabet
89
90
92
93
94 import warnings
95 warnings.warn("Writing to the Seq object's .data propery is deprecated.",
96 DeprecationWarning)
97 self._data = value
98 data = property(fget= lambda self : str(self),
99 fset=_set_data,
100 doc="Sequence as a string (DEPRECATED)")
101
103 """Returns a (truncated) representation of the sequence for debugging."""
104 if len(self) > 60 :
105
106
107
108 return "%s('%s...%s', %s)" % (self.__class__.__name__,
109 str(self)[:54], str(self)[-3:],
110 repr(self.alphabet))
111 else :
112 return "%s(%s, %s)" % (self.__class__.__name__,
113 repr(self.data),
114 repr(self.alphabet))
116 """Returns the full sequence as a python string.
117
118 Note that Biopython 1.44 and earlier would give a truncated
119 version of repr(my_seq) for str(my_seq). If you are writing code
120 which need to be backwards compatible with old Biopython, you
121 should continue to use my_seq.tostring() rather than str(my_seq).
122 """
123 return self._data
124
125
126
127
128
def __len__(self):
    """Returns the length of the sequence, use len(my_seq)."""
    # The sequence letters are held in the private _data string.
    stored = self._data
    return len(stored)
130
132
133
134
135 if isinstance(index, int) :
136
137 return self._data[index]
138 else :
139
140 return Seq(self._data[index], self.alphabet)
141
158
174
176 """Returns the full sequence as a python string.
177
178 Although not formally deprecated, you are now encouraged to use
179 str(my_seq) instead of my_seq.tostring()."""
180 return str(self)
181
183 """Returns the full sequence as a MutableSeq object.
184
185 >>> from Bio.Seq import Seq
186 >>> from Bio.Alphabet import IUPAC
187 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL",
188 ... IUPAC.protein)
189 >>> my_seq
190 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
191 >>> my_seq.tomutable()
192 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
193
194 Note that the alphabet is preserved.
195 """
196 return MutableSeq(str(self), self.alphabet)
197
199 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE).
200
201 For a string argument, returns the string.
202
203 For a Seq or MutableSeq, it checks the alphabet is compatible
204 (raising an exception if it isn't), and then returns a string.
205 """
206 try :
207 other_alpha = other_sequence.alphabet
208 except AttributeError :
209
210 return other_sequence
211
212
213 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]) :
214 raise TypeError("Incompatable alphabets %s and %s" \
215 % (repr(self.alphabet), repr(other_alpha)))
216
217 return str(other_sequence)
218
def count(self, sub, start=0, end=sys.maxsize):
    """Non-overlapping count method, like that of a python string.

    This behaves like the python string method of the same name,
    which does a non-overlapping count!

    Returns an integer, the number of occurrences of substring
    argument sub in the (sub)sequence given by [start:end].
    Optional arguments start and end are interpreted as in slice
    notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Raises a TypeError if sub is a Seq (or Seq like) object whose
    alphabet is incompatible with that of this sequence.

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("AAAATGA")
    >>> print my_seq.count("A")
    5
    >>> print my_seq.count("ATG")
    1
    >>> print my_seq.count(Seq("AT"))
    1
    >>> print my_seq.count("AT", 2, -1)
    1

    HOWEVER, please note because python strings and Seq objects (and
    MutableSeq objects) do a non-overlapping search, this may not give
    the answer you expect:

    >>> "AAAA".count("AA")
    2
    >>> print Seq("AAAA").count("AA")
    2

    An overlapping search would give the answer as three!
    """
    # The default for end is sys.maxsize rather than sys.maxint: the
    # latter was removed in Python 3, while the former exists on both.

    # If sub has an alphabet, check it is compatible (may raise TypeError):
    sub_str = self._get_seq_str_and_check_alphabet(sub)
    return str(self).count(sub_str, start, end)
262
264 """Implements the 'in' keyword, like a python string.
265
266 e.g.
267
268 >>> from Bio.Seq import Seq
269 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein
270 >>> my_dna = Seq("ATATGAAATTTGAAAA", generic_dna)
271 >>> "AAA" in my_dna
272 True
273 >>> Seq("AAA") in my_dna
274 True
275 >>> Seq("AAA", generic_dna) in my_dna
276 True
277
278 Like other Seq methods, this will raise a type error if another Seq
279 (or Seq like) object with an incompatible alphabet is used:
280
281 >>> Seq("AAA", generic_rna) in my_dna
282 Traceback (most recent call last):
283 ...
284 TypeError: Incompatable alphabets DNAAlphabet() and RNAAlphabet()
285 >>> Seq("AAA", generic_protein) in my_dna
286 Traceback (most recent call last):
287 ...
288 TypeError: Incompatable alphabets DNAAlphabet() and ProteinAlphabet()
289 """
290
291 sub_str = self._get_seq_str_and_check_alphabet(char)
292 return sub_str in str(self)
293
def find(self, sub, start=0, end=sys.maxsize):
    """Find method, like that of a python string.

    This behaves like the python string method of the same name.

    Returns an integer, the index of the first occurrence of substring
    argument sub in the (sub)sequence given by [start:end].

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Returns -1 if the subsequence is NOT found.

    Raises a TypeError if sub is a Seq (or Seq like) object whose
    alphabet is incompatible with that of this sequence.

    e.g. Locating the first typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.find("AUG")
    3
    """
    # The default for end is sys.maxsize rather than sys.maxint: the
    # latter was removed in Python 3, while the former exists on both.

    # If sub has an alphabet, check it is compatible (may raise TypeError):
    sub_str = self._get_seq_str_and_check_alphabet(sub)
    return str(self).find(sub_str, start, end)
319
def rfind(self, sub, start=0, end=sys.maxsize):
    """Find from right method, like that of a python string.

    This behaves like the python string method of the same name.

    Returns an integer, the index of the last (right most) occurrence of
    substring argument sub in the (sub)sequence given by [start:end].

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    Returns -1 if the subsequence is NOT found.

    Raises a TypeError if sub is a Seq (or Seq like) object whose
    alphabet is incompatible with that of this sequence.

    e.g. Locating the last typical start codon, AUG, in an RNA sequence:

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.rfind("AUG")
    15
    """
    # The default for end is sys.maxsize rather than sys.maxint: the
    # latter was removed in Python 3, while the former exists on both.

    # If sub has an alphabet, check it is compatible (may raise TypeError):
    sub_str = self._get_seq_str_and_check_alphabet(sub)
    return str(self).rfind(sub_str, start, end)
345
def startswith(self, prefix, start=0, end=sys.maxsize):
    """Does the Seq start with the given prefix?  Returns True/False.

    This behaves like the python string method of the same name.

    Return True if the sequence starts with the specified prefix
    (a string or another Seq object), False otherwise.
    With optional start, test sequence beginning at that position.
    With optional end, stop comparing sequence at that position.
    prefix can also be a tuple of strings to try.  e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.startswith("GUC")
    True
    >>> my_rna.startswith("AUG")
    False
    >>> my_rna.startswith("AUG", 3)
    True
    >>> my_rna.startswith(("UCC","UCA","UCG"),1)
    True

    Raises a TypeError if prefix (or any entry of a prefix tuple) is a
    Seq (or Seq like) object with an incompatible alphabet.
    """
    # The default for end is sys.maxsize rather than sys.maxint: the
    # latter was removed in Python 3, while the former exists on both.

    # Build the plain string once; it is reused for every candidate.
    text = str(self)
    if isinstance(prefix, tuple):
        # Check the alphabet of every candidate before any matching, so
        # an incompatible entry raises TypeError even when an earlier
        # entry would have matched.
        prefix_strings = [self._get_seq_str_and_check_alphabet(p)
                          for p in prefix]
        for prefix_str in prefix_strings:
            if text.startswith(prefix_str, start, end):
                return True
        return False
    prefix_str = self._get_seq_str_and_check_alphabet(prefix)
    return text.startswith(prefix_str, start, end)
382
def endswith(self, suffix, start=0, end=sys.maxsize):
    """Does the Seq end with the given suffix?  Returns True/False.

    This behaves like the python string method of the same name.

    Return True if the sequence ends with the specified suffix
    (a string or another Seq object), False otherwise.
    With optional start, test sequence beginning at that position.
    With optional end, stop comparing sequence at that position.
    suffix can also be a tuple of strings to try.  e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_rna.endswith("UUG")
    True
    >>> my_rna.endswith("AUG")
    False
    >>> my_rna.endswith("AUG", 0, 18)
    True
    >>> my_rna.endswith(("UCC","UCA","UUG"))
    True

    Raises a TypeError if suffix (or any entry of a suffix tuple) is a
    Seq (or Seq like) object with an incompatible alphabet.
    """
    # The default for end is sys.maxsize rather than sys.maxint: the
    # latter was removed in Python 3, while the former exists on both.

    # Build the plain string once; it is reused for every candidate.
    text = str(self)
    if isinstance(suffix, tuple):
        # Check the alphabet of every candidate before any matching, so
        # an incompatible entry raises TypeError even when an earlier
        # entry would have matched.
        suffix_strings = [self._get_seq_str_and_check_alphabet(p)
                          for p in suffix]
        for suffix_str in suffix_strings:
            if text.endswith(suffix_str, start, end):
                return True
        return False
    suffix_str = self._get_seq_str_and_check_alphabet(suffix)
    return text.endswith(suffix_str, start, end)
419
420
def split(self, sep=None, maxsplit=-1):
    """Split method, like that of a python string.

    Works like the python string method of the same name, except that
    the 'words' are returned as a list of Seq objects which carry this
    sequence's alphabet.

    The delimiter is sep (a string, or a Seq like object with a
    compatible alphabet).  At most maxsplit splits are made; if
    maxsplit is omitted, every possible split is made.

    As for python strings, the default sep of None splits on white
    space (tabs, spaces, newlines), although that is unlikely to be
    useful for biological sequences.

    e.g.

    >>> from Bio.Seq import Seq
    >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
    >>> my_aa = my_rna.translate()
    >>> my_aa
    Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))
    >>> my_aa.split("*")
    [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
    >>> my_aa.split("*",1)
    [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))]

    See also the rsplit method:

    >>> my_aa.rsplit("*",1)
    [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
    """
    # A separator with an alphabet must be compatible with ours.
    delimiter = self._get_seq_str_and_check_alphabet(sep)
    # Split the plain string, then wrap each piece back up as a Seq.
    pieces = str(self).split(delimiter, maxsplit)
    return [Seq(piece, self.alphabet) for piece in pieces]
458
def rsplit(self, sep=None, maxsplit=-1):
    """Right split method, like that of a python string.

    Works like the python string method of the same name, except that
    the 'words' are returned as a list of Seq objects which carry this
    sequence's alphabet.

    The delimiter is sep (a string, or a Seq like object with a
    compatible alphabet).  At most maxsplit splits are made, COUNTING
    FROM THE RIGHT; if maxsplit is omitted, every possible split is
    made.

    As for python strings, the default sep of None splits on white
    space (tabs, spaces, newlines), although that is unlikely to be
    useful for biological sequences.

    e.g. print my_seq.rsplit("*",1)

    See also the split method.
    """
    # A separator with an alphabet must be compatible with ours.
    delimiter = self._get_seq_str_and_check_alphabet(sep)
    # Split the plain string, then wrap each piece back up as a Seq.
    pieces = str(self).rsplit(delimiter, maxsplit)
    return [Seq(piece, self.alphabet) for piece in pieces]
481
def strip(self, chars=None):
    """Returns a new Seq object with leading and trailing ends stripped.

    Works like the python string method of the same name.

    The optional chars argument gives the characters to remove (as a
    string, or a Seq like object with a compatible alphabet).  When it
    is omitted or None, white space is removed, just as for a python
    string.

    e.g. print my_seq.strip("-")

    See also the lstrip and rstrip methods.
    """
    # A chars argument with an alphabet must be compatible with ours.
    unwanted = self._get_seq_str_and_check_alphabet(chars)
    trimmed = str(self).strip(unwanted)
    return Seq(trimmed, self.alphabet)
498
def lstrip(self, chars=None):
    """Returns a new Seq object with leading (left) end stripped.

    Works like the python string method of the same name.

    The optional chars argument gives the characters to remove (as a
    string, or a Seq like object with a compatible alphabet).  When it
    is omitted or None, white space is removed, just as for a python
    string.

    e.g. print my_seq.lstrip("-")

    See also the strip and rstrip methods.
    """
    # A chars argument with an alphabet must be compatible with ours.
    unwanted = self._get_seq_str_and_check_alphabet(chars)
    trimmed = str(self).lstrip(unwanted)
    return Seq(trimmed, self.alphabet)
515
def rstrip(self, chars=None):
    """Returns a new Seq object with trailing (right) end stripped.

    Works like the python string method of the same name.

    The optional chars argument gives the characters to remove (as a
    string, or a Seq like object with a compatible alphabet).  When it
    is omitted or None, white space is removed, just as for a python
    string.

    e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail):

    >>> from Bio.Alphabet import IUPAC
    >>> from Bio.Seq import Seq
    >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna)
    >>> my_seq
    Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA())
    >>> my_seq.rstrip("A")
    Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA())

    See also the strip and lstrip methods.
    """
    # A chars argument with an alphabet must be compatible with ours.
    unwanted = self._get_seq_str_and_check_alphabet(chars)
    trimmed = str(self).rstrip(unwanted)
    return Seq(trimmed, self.alphabet)
540
542 """Returns the complement sequence. New Seq object.
543
544 >>> from Bio.Seq import Seq
545 >>> from Bio.Alphabet import IUPAC
546 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna)
547 >>> my_dna
548 Seq('CCCCCGATAG', IUPACUnambiguousDNA())
549 >>> my_dna.complement()
550 Seq('GGGGGCTATC', IUPACUnambiguousDNA())
551
552 You can of course used mixed case sequences,
553
554 >>> from Bio.Seq import Seq
555 >>> from Bio.Alphabet import generic_dna
556 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna)
557 >>> my_dna
558 Seq('CCCCCgatA-GD', DNAAlphabet())
559 >>> my_dna.complement()
560 Seq('GGGGGctaT-CH', DNAAlphabet())
561
562 Note in the above example, ambiguous character D denotes
563 G, A or T so its complement is H (for C, T or A).
564
565 Trying to complement a protein sequence raises an exception.
566
567 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
568 >>> my_protein.complement()
569 Traceback (most recent call last):
570 ...
571 ValueError: Proteins do not have complements!
572 """
573 base = Alphabet._get_base_alphabet(self.alphabet)
574 if isinstance(base, Alphabet.ProteinAlphabet) :
575 raise ValueError("Proteins do not have complements!")
576 if isinstance(base, Alphabet.DNAAlphabet) :
577 ttable = _dna_complement_table
578 elif isinstance(base, Alphabet.RNAAlphabet) :
579 ttable = _rna_complement_table
580 elif ('U' in self._data or 'u' in self._data) \
581 and ('T' in self._data or 't' in self._data):
582
583 raise ValueError("Mixed RNA/DNA found")
584 elif 'U' in self._data or 'u' in self._data:
585 ttable = _rna_complement_table
586 else:
587 ttable = _dna_complement_table
588
589
590 return Seq(str(self).translate(ttable), self.alphabet)
591
593 """Returns the reverse complement sequence. New Seq object.
594
595 >>> from Bio.Seq import Seq
596 >>> from Bio.Alphabet import IUPAC
597 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna)
598 >>> my_dna
599 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA())
600 >>> my_dna.reverse_complement()
601 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA())
602
603 Note in the above example, since R = G or A, its complement
604 is Y (which denotes C or T).
605
606 You can of course used mixed case sequences,
607
608 >>> from Bio.Seq import Seq
609 >>> from Bio.Alphabet import generic_dna
610 >>> my_dna = Seq("CCCCCgatA-G", generic_dna)
611 >>> my_dna
612 Seq('CCCCCgatA-G', DNAAlphabet())
613 >>> my_dna.reverse_complement()
614 Seq('C-TatcGGGGG', DNAAlphabet())
615
616 Trying to complement a protein sequence raises an exception:
617
618 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
619 >>> my_protein.reverse_complement()
620 Traceback (most recent call last):
621 ...
622 ValueError: Proteins do not have complements!
623 """
624
625 return self.complement()[::-1]
626
628 """Returns the RNA sequence from a DNA sequence. New Seq object.
629
630 >>> from Bio.Seq import Seq
631 >>> from Bio.Alphabet import IUPAC
632 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG",
633 ... IUPAC.unambiguous_dna)
634 >>> coding_dna
635 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
636 >>> coding_dna.transcribe()
637 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
638
639 Trying to transcribe a protein or RNA sequence raises an exception:
640
641 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
642 >>> my_protein.transcribe()
643 Traceback (most recent call last):
644 ...
645 ValueError: Proteins cannot be transcribed!
646 """
647 base = Alphabet._get_base_alphabet(self.alphabet)
648 if isinstance(base, Alphabet.ProteinAlphabet) :
649 raise ValueError("Proteins cannot be transcribed!")
650 if isinstance(base, Alphabet.RNAAlphabet) :
651 raise ValueError("RNA cannot be transcribed!")
652
653 if self.alphabet==IUPAC.unambiguous_dna:
654 alphabet = IUPAC.unambiguous_rna
655 elif self.alphabet==IUPAC.ambiguous_dna:
656 alphabet = IUPAC.ambiguous_rna
657 else:
658 alphabet = Alphabet.generic_rna
659 return Seq(str(self).replace('T','U').replace('t','u'), alphabet)
660
662 """Returns the DNA sequence from an RNA sequence. New Seq object.
663
664 >>> from Bio.Seq import Seq
665 >>> from Bio.Alphabet import IUPAC
666 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG",
667 ... IUPAC.unambiguous_rna)
668 >>> messenger_rna
669 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
670 >>> messenger_rna.back_transcribe()
671 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
672
673 Trying to back-transcribe a protein or DNA sequence raises an
674 exception:
675
676 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
677 >>> my_protein.back_transcribe()
678 Traceback (most recent call last):
679 ...
680 ValueError: Proteins cannot be back transcribed!
681 """
682 base = Alphabet._get_base_alphabet(self.alphabet)
683 if isinstance(base, Alphabet.ProteinAlphabet) :
684 raise ValueError("Proteins cannot be back transcribed!")
685 if isinstance(base, Alphabet.DNAAlphabet) :
686 raise ValueError("DNA cannot be back transcribed!")
687
688 if self.alphabet==IUPAC.unambiguous_rna:
689 alphabet = IUPAC.unambiguous_dna
690 elif self.alphabet==IUPAC.ambiguous_rna:
691 alphabet = IUPAC.ambiguous_dna
692 else:
693 alphabet = Alphabet.generic_dna
694 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
695
def translate(self, table="Standard", stop_symbol="*", to_stop=False,
              cds=False):
    """Turns a nucleotide sequence into a protein sequence. New Seq object.

    DNA and RNA sequences can be translated, as can sequences with a
    nucleotide or generic alphabet.  Attempting to translate a protein
    sequence raises an exception.

    Arguments:
     - table - Which codon table to use?  This can be either a name
               (string) or an NCBI identifier (integer).  This defaults
               to the "Standard" table.
     - stop_symbol - Single character string, what to use for terminators.
                     This defaults to the asterisk, "*".
     - to_stop - Boolean, defaults to False meaning do a full translation
                 continuing on past any stop codons (translated as the
                 specified stop_symbol).  If True, translation is
                 terminated at the first in frame stop codon (and the
                 stop_symbol is not appended to the returned protein
                 sequence).
     - cds - Boolean, indicates this is a complete CDS.  If True,
             this checks the sequence starts with a valid alternative start
             codon (which will be translated as methionine, M), that the
             sequence length is a multiple of three, and that there is a
             single in frame stop codon at the end (this will be excluded
             from the protein sequence, regardless of the to_stop option).
             If these tests fail, an exception is raised.

    The alphabet of the returned Seq is the codon table's protein
    alphabet, wrapped in HasStopCodon if the stop_symbol appears in the
    translation.

    e.g. Using the standard table:

    >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
    >>> coding_dna.translate()
    Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
    >>> coding_dna.translate(stop_symbol="@")
    Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@'))
    >>> coding_dna.translate(to_stop=True)
    Seq('VAIVMGR', ExtendedIUPACProtein())

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> coding_dna.translate(table=2)
    Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
    >>> coding_dna.translate(table=2, to_stop=True)
    Seq('VAIVMGRWKGAR', ExtendedIUPACProtein())

    In fact, GTG is an alternative start codon under NCBI table 2, meaning
    this sequence could be a complete CDS:

    >>> coding_dna.translate(table=2, cds=True)
    Seq('MAIVMGRWKGAR', ExtendedIUPACProtein())

    It isn't a valid CDS under NCBI table 1, due to both the start codon and
    also the in frame stop codons:

    >>> coding_dna.translate(table=1, cds=True)
    Traceback (most recent call last):
        ...
    TranslationError: First codon 'GTG' is not a start codon

    If the sequence has no in-frame stop codon, then the to_stop argument
    has no effect:

    >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
    >>> coding_dna2.translate()
    Seq('LAIVMGR', ExtendedIUPACProtein())
    >>> coding_dna2.translate(to_stop=True)
    Seq('LAIVMGR', ExtendedIUPACProtein())

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    NOTE - Does NOT support gapped sequences.

    NOTE - This does NOT behave like the python string's translate
    method.  For that use str(my_seq).translate(...) instead.
    """
    # A table which converts to an integer is an NCBI identifier;
    # otherwise it is looked up by name.
    try:
        table_id = int(table)
    except ValueError:
        table_id = None

    # Catch callers expecting the python string translate behaviour.
    if isinstance(table, str) and len(table) == 256:
        raise ValueError("The Seq object translate method DOES NOT take "
                         "a 256 character string mapping table like "
                         "the python string object's translate method. "
                         "Use str(my_seq).translate(...) instead.")

    base_alphabet = Alphabet._get_base_alphabet(self.alphabet)
    if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
        raise ValueError("Proteins cannot be translated!")

    # Choose the family of codon tables from the alphabet.  Anything
    # other than the two unambiguous IUPAC alphabets uses the ambiguous
    # generic tables.
    if self.alphabet == IUPAC.unambiguous_dna:
        tables_by_name = CodonTable.unambiguous_dna_by_name
        tables_by_id = CodonTable.unambiguous_dna_by_id
    elif self.alphabet == IUPAC.unambiguous_rna:
        tables_by_name = CodonTable.unambiguous_rna_by_name
        tables_by_id = CodonTable.unambiguous_rna_by_id
    else:
        tables_by_name = CodonTable.ambiguous_generic_by_name
        tables_by_id = CodonTable.ambiguous_generic_by_id

    if table_id is None:
        codon_table = tables_by_name[table]
    else:
        codon_table = tables_by_id[table_id]

    protein = _translate_str(str(self), codon_table,
                             stop_symbol, to_stop, cds)

    # Only flag the alphabet as having stop codons when one is present.
    if stop_symbol not in protein:
        return Seq(protein, codon_table.protein_alphabet)
    stop_alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet,
                                          stop_symbol=stop_symbol)
    return Seq(protein, stop_alphabet)
823
825 """A read-only sequence object of known length but unknown contents.
826
827 If you have an unknown sequence, you can represent this with a normal
828 Seq object, for example:
829
830 >>> my_seq = Seq("N"*5)
831 >>> my_seq
832 Seq('NNNNN', Alphabet())
833 >>> len(my_seq)
834 5
835 >>> print my_seq
836 NNNNN
837
838 However, this is rather wasteful of memory (especially for large
839 sequences), which is where this class is most usefull:
840
841 >>> unk_five = UnknownSeq(5)
842 >>> unk_five
843 UnknownSeq(5, alphabet = Alphabet(), character = '?')
844 >>> len(unk_five)
845 5
846 >>> print(unk_five)
847 ?????
848
849 You can add unknown sequence together, provided their alphabets and
850 characters are compatible, and get another memory saving UnknownSeq:
851
852 >>> unk_four = UnknownSeq(4)
853 >>> unk_four
854 UnknownSeq(4, alphabet = Alphabet(), character = '?')
855 >>> unk_four + unk_five
856 UnknownSeq(9, alphabet = Alphabet(), character = '?')
857
858 If the alphabet or characters don't match up, the addition gives an
859 ordinary Seq object:
860
861 >>> unk_nnnn = UnknownSeq(4, character = "N")
862 >>> unk_nnnn
863 UnknownSeq(4, alphabet = Alphabet(), character = 'N')
864 >>> unk_nnnn + unk_four
865 Seq('NNNN????', Alphabet())
866
867 Combining with a real Seq gives a new Seq object:
868
869 >>> known_seq = Seq("ACGT")
870 >>> unk_four + known_seq
871 Seq('????ACGT', Alphabet())
872 >>> known_seq + unk_four
873 Seq('ACGT????', Alphabet())
874 """
876 """Create a new UnknownSeq object.
877
878 If character is ommited, it is determed from the alphabet, "N" for
879 nucleotides, "X" for proteins, and "?" otherwise.
880 """
881 self._length = int(length)
882 if self._length < 0 :
883
884 raise ValueError("Length must not be negative.")
885 self.alphabet = alphabet
886 if character :
887 if len(character) != 1 :
888 raise ValueError("character argument should be a single letter string.")
889 self._character = character
890 else :
891 base = Alphabet._get_base_alphabet(alphabet)
892
893
894 if isinstance(base, Alphabet.NucleotideAlphabet) :
895 self._character = "N"
896 elif isinstance(base, Alphabet.ProteinAlphabet) :
897 self._character = "X"
898 else :
899 self._character = "?"
900
902 """Returns the stated length of the unknown sequence."""
903 return self._length
904
906 """Returns the unknown sequence as full string of the given length."""
907 return self._character * self._length
908
910 return "UnknownSeq(%i, alphabet = %s, character = %s)" \
911 % (self._length, repr(self.alphabet), repr(self._character))
912
914 if isinstance(other, UnknownSeq) \
915 and other._character == self._character :
916
917 return UnknownSeq(len(self)+len(other),
918 self.alphabet, self._character)
919
920 return Seq(str(self), self.alphabet) + other
921
923 if isinstance(other, UnknownSeq) \
924 and other._character == self._character :
925
926 return UnknownSeq(len(self)+len(other),
927 self.alphabet, self._character)
928
929 return other + Seq(str(self), self.alphabet)
930
939
def count(self, sub, start=0, end=sys.maxsize):
    """Non-overlapping count method, like that of a python string.

    This behaves like the python string (and Seq object) method of the
    same name, which does a non-overlapping count!

    Returns an integer, the number of occurrences of substring
    argument sub in the (sub)sequence given by [start:end].
    Optional arguments start and end are interpreted as in slice
    notation.

    Arguments:
     - sub - a string or another Seq object to look for
     - start - optional integer, slice start
     - end - optional integer, slice end

    >>> "NNNN".count("N")
    4
    >>> Seq("NNNN").count("N")
    4
    >>> UnknownSeq(4, character="N").count("N")
    4
    >>> UnknownSeq(4, character="N").count("A")
    0
    >>> UnknownSeq(4, character="N").count("AA")
    0

    HOWEVER, please note that because python strings and Seq objects (and
    MutableSeq objects) do a non-overlapping search, this may not give
    the answer you expect:

    >>> UnknownSeq(4, character="N").count("NN")
    2
    >>> UnknownSeq(4, character="N").count("NNN")
    1
    """
    # The default for end is sys.maxsize rather than sys.maxint: the
    # latter was removed in Python 3, while the former exists on both.
    sub_str = self._get_seq_str_and_check_alphabet(sub)
    if not sub_str:
        # An empty substring matches at every position of a python
        # string; defer to the string method so the answers agree.
        return str(self).count(sub_str, start, end)
    if set(sub_str) != set(self._character):
        # sub contains a letter other than our one repeated character.
        return 0
    if start == 0 and end >= self._length:
        # Whole sequence: answer by arithmetic, without building the
        # full string in memory.
        return self._length // len(sub_str)
    # Partial range: let the python string method do the slice handling.
    return str(self).count(sub_str, start, end)
995
997 """The complement of an unknown nucleotide equals itself.
998
999 >>> my_nuc = UnknownSeq(8)
1000 >>> my_nuc
1001 UnknownSeq(8, alphabet = Alphabet(), character = '?')
1002 >>> print my_nuc
1003 ????????
1004 >>> my_nuc.complement()
1005 UnknownSeq(8, alphabet = Alphabet(), character = '?')
1006 >>> print my_nuc.complement()
1007 ????????
1008 """
1009 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1010 Alphabet.ProteinAlphabet) :
1011 raise ValueError("Proteins do not have complements!")
1012 return self
1013
1015 """The reverse complement of an unknown nucleotide equals itself.
1016
1017 >>> my_nuc = UnknownSeq(10)
1018 >>> my_nuc
1019 UnknownSeq(10, alphabet = Alphabet(), character = '?')
1020 >>> print my_nuc
1021 ??????????
1022 >>> my_nuc.reverse_complement()
1023 UnknownSeq(10, alphabet = Alphabet(), character = '?')
1024 >>> print my_nuc.reverse_complement()
1025 ??????????
1026 """
1027 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1028 Alphabet.ProteinAlphabet) :
1029 raise ValueError("Proteins do not have complements!")
1030 return self
1031
1033 """Returns unknown RNA sequence from an unknown DNA sequence.
1034
1035 >>> my_dna = UnknownSeq(10, character="N")
1036 >>> my_dna
1037 UnknownSeq(10, alphabet = Alphabet(), character = 'N')
1038 >>> print my_dna
1039 NNNNNNNNNN
1040 >>> my_rna = my_dna.transcribe()
1041 >>> my_rna
1042 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N')
1043 >>> print my_rna
1044 NNNNNNNNNN
1045 """
1046
1047 s = Seq(self._character, self.alphabet).transcribe()
1048 return UnknownSeq(self._length, s.alphabet, self._character)
1049
1051 """Returns unknown DNA sequence from an unknown RNA sequence.
1052
1053 >>> my_rna = UnknownSeq(20, character="N")
1054 >>> my_rna
1055 UnknownSeq(20, alphabet = Alphabet(), character = 'N')
1056 >>> print my_rna
1057 NNNNNNNNNNNNNNNNNNNN
1058 >>> my_dna = my_rna.back_transcribe()
1059 >>> my_dna
1060 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1061 >>> print my_dna
1062 NNNNNNNNNNNNNNNNNNNN
1063 """
1064
1065 s = Seq(self._character, self.alphabet).back_transcribe()
1066 return UnknownSeq(self._length, s.alphabet, self._character)
1067
1069 """Translate an unknown nucleotide sequence into an unknown protein.
1070
1071 e.g.
1072
1073 >>> my_seq = UnknownSeq(11, character="N")
1074 >>> print my_seq
1075 NNNNNNNNNNN
1076 >>> my_protein = my_seq.translate()
1077 >>> my_protein
1078 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X')
1079 >>> print my_protein
1080 XXX
1081
1082 In comparison, using a normal Seq object:
1083
1084 >>> my_seq = Seq("NNNNNNNNNNN")
1085 >>> print my_seq
1086 NNNNNNNNNNN
1087 >>> my_protein = my_seq.translate()
1088 >>> my_protein
1089 Seq('XXX', ExtendedIUPACProtein())
1090 >>> print my_protein
1091 XXX
1092
1093 """
1094 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1095 Alphabet.ProteinAlphabet) :
1096 raise ValueError("Proteins cannot be translated!")
1097 return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
1098
1099
1101 """An editable sequence object (with an alphabet).
1102
1103 Unlike normal python strings and our basic sequence object (the Seq class)
1104 which are immutable, the MutableSeq lets you edit the sequence in place.
1105 However, this means you cannot use a MutableSeq object as a dictionary key.
1106
1107 >>> from Bio.Seq import MutableSeq
1108 >>> from Bio.Alphabet import generic_dna
1109 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna)
1110 >>> my_seq
1111 MutableSeq('ACTCGTCGTCG', DNAAlphabet())
1112 >>> my_seq[5]
1113 'T'
1114 >>> my_seq[5] = "A"
1115 >>> my_seq
1116 MutableSeq('ACTCGACGTCG', DNAAlphabet())
1117 >>> my_seq[5]
1118 'A'
1119 >>> my_seq[5:8] = "NNN"
1120 >>> my_seq
1121 MutableSeq('ACTCGNNNTCG', DNAAlphabet())
1122 >>> len(my_seq)
1123 11
1124
1125 Note that the MutableSeq object does not support as many string-like
1126 or biological methods as the Seq object.
1127 """
1134
1136 """Returns a (truncated) representation of the sequence for debugging."""
1137 if len(self) > 60 :
1138
1139
1140
1141 return "%s('%s...%s', %s)" % (self.__class__.__name__,
1142 str(self[:54]), str(self[-3:]),
1143 repr(self.alphabet))
1144 else :
1145 return "%s('%s', %s)" % (self.__class__.__name__,
1146 str(self),
1147 repr(self.alphabet))
1148
1150 """Returns the full sequence as a python string.
1151
1152 Note that Biopython 1.44 and earlier would give a truncated
1153 version of repr(my_seq) for str(my_seq). If you are writing code
1154 which needs to be backwards compatible with old Biopython, you
1155 should continue to use my_seq.tostring() rather than str(my_seq).
1156 """
1157
1158 return "".join(self.data)
1159
1161 """Compare the sequence to another sequence or a string.
1162
1163 If compared to another sequence the alphabets must be compatible.
1164 Comparing DNA to RNA, or Nucleotide to Protein will raise an
1165 exception.
1166
1167 Otherwise only the sequence itself is compared, not the precise
1168 alphabet.
1169
1170 This method indirectly supports ==, < , etc."""
1171 if hasattr(other, "alphabet") :
1172
1173 if not Alphabet._check_type_compatible([self.alphabet,
1174 other.alphabet]) :
1175 raise TypeError("Incompatable alphabets %s and %s" \
1176 % (repr(self.alphabet), repr(other.alphabet)))
1177
1178 if isinstance(other, MutableSeq):
1179
1180
1181 return cmp(self.data, other.data)
1182 else :
1183 return cmp(str(self), str(other))
1184 elif isinstance(other, basestring) :
1185 return cmp(str(self), other)
1186 else :
1187 raise TypeError
1188
1190
1201
1217
1219
1220
1221
1222
1223
1224 del self.data[index]
1225
1227 """Add another sequence or string to this sequence.
1228
1229 Returns a new MutableSeq object."""
1230 if hasattr(other, "alphabet") :
1231
1232 if not Alphabet._check_type_compatible([self.alphabet,
1233 other.alphabet]) :
1234 raise TypeError("Incompatable alphabets %s and %s" \
1235 % (repr(self.alphabet), repr(other.alphabet)))
1236
1237 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
1238 if isinstance(other, MutableSeq):
1239
1240
1241 return self.__class__(self.data + other.data, a)
1242 else :
1243 return self.__class__(str(self) + str(other), a)
1244 elif isinstance(other, basestring) :
1245
1246 return self.__class__(str(self) + str(other), self.alphabet)
1247 else :
1248 raise TypeError
1249
1270
1273
1276
def pop(self, i=-1):
    """Remove one letter from the sequence and return it.

    Arguments:
     - i - integer index of the letter to remove, defaults to -1
           (the last letter).

    The letter is read from self.data and then deleted from it, so the
    sequence is edited in place.
    """
    letter = self.data[i]
    del self.data[i]
    return letter
1281
1283 for i in range(len(self.data)):
1284 if self.data[i] == item:
1285 del self.data[i]
1286 return
1287 raise ValueError("MutableSeq.remove(x): x not in list")
1288
1289 - def count(self, sub, start=0, end=sys.maxint):
1290 """Non-overlapping count method, like that of a python string.
1291
1292 This behaves like the python string method of the same name,
1293 which does a non-overlapping count!
1294
1295 Returns an integer, the number of occurrences of substring
1296 argument sub in the (sub)sequence given by [start:end].
1297 Optional arguments start and end are interpreted as in slice
1298 notation.
1299
1300 Arguments:
1301 - sub - a string or another Seq object to look for
1302 - start - optional integer, slice start
1303 - end - optional integer, slice end
1304
1305 e.g.
1306
1307 >>> from Bio.Seq import MutableSeq
1308 >>> my_mseq = MutableSeq("AAAATGA")
1309 >>> print my_mseq.count("A")
1310 5
1311 >>> print my_mseq.count("ATG")
1312 1
1313 >>> print my_mseq.count(Seq("AT"))
1314 1
1315 >>> print my_mseq.count("AT", 2, -1)
1316 1
1317
1318 HOWEVER, please note because that python strings, Seq objects and
1319 MutableSeq objects do a non-overlapping search, this may not give
1320 the answer you expect:
1321
1322 >>> "AAAA".count("AA")
1323 2
1324 >>> print MutableSeq("AAAA").count("AA")
1325 2
1326
1327 A non-overlapping search would give the answer as three!
1328 """
1329 try :
1330
1331 search = sub.tostring()
1332 except AttributeError :
1333 search = sub
1334
1335 if not isinstance(search, basestring) :
1336 raise TypeError("expected a string, Seq or MutableSeq")
1337
1338 if len(search) == 1 :
1339
1340 count = 0
1341 for c in self.data[start:end]:
1342 if c == search: count += 1
1343 return count
1344 else :
1345
1346 return self.tostring().count(search, start, end)
1347
1349 for i in range(len(self.data)):
1350 if self.data[i] == item:
1351 return i
1352 raise ValueError("MutableSeq.index(x): x not in list")
1353
1355 """Modify the mutable sequence to reverse itself.
1356
1357 No return value.
1358 """
1359 self.data.reverse()
1360
1386
1388 """Modify the mutable sequence to take on its reverse complement.
1389
1390 Trying to reverse complement a protein sequence raises an exception.
1391
1392 No return value.
1393 """
1394 self.complement()
1395 self.data.reverse()
1396
1397
1398
1399
1407
1409 """Returns the full sequence as a python string.
1410
1411 Although not formally deprecated, you are now encouraged to use
1412 str(my_seq) instead of my_seq.tostring().
1413
1414 Because str(my_seq) will give you the full sequence as a python string,
1415 there is often no need to make an explicit conversion. For example,
1416
1417 print "ID={%s}, sequence={%s}" % (my_name, my_seq)
1418
1419 On Biopython 1.44 or older you would have to have done this:
1420
1421 print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())
1422 """
1423 return "".join(self.data)
1424
1426 """Returns the full sequence as a new immutable Seq object.
1427
1428 >>> from Bio.Seq import Seq
1429 >>> from Bio.Alphabet import IUPAC
1430 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", \
1431 IUPAC.protein)
1432 >>> my_mseq
1433 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
1434 >>> my_mseq.toseq()
1435 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
1436
1437 Note that the alphabet is preserved.
1438 """
1439 return Seq("".join(self.data), self.alphabet)
1440
1441
1442
1443
1444
1446 """Transcribes a DNA sequence into RNA.
1447
1448 If given a string, returns a new string object.
1449
1450 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
1451
1452 Trying to transcribe a protein or RNA sequence raises an exception.
1453
1454 e.g.
1455
1456 >>> transcribe("ACTGN")
1457 'ACUGN'
1458 """
1459 if isinstance(dna, Seq) :
1460 return dna.transcribe()
1461 elif isinstance(dna, MutableSeq):
1462 return dna.toseq().transcribe()
1463 else:
1464 return dna.replace('T','U').replace('t','u')
1465
1467 """Back-transcribes an RNA sequence into DNA.
1468
1469 If given a string, returns a new string object.
1470
1471 Given a Seq or MutableSeq, returns a new Seq object with a DNA alphabet.
1472
1473 Trying to back-transcribe a protein or DNA sequence raises an exception.
1474
1475 e.g.
1476
1477 >>> back_transcribe("ACUGN")
1478 'ACTGN'
1479 """
1480 if isinstance(rna, Seq) :
1481 return rna.back_transcribe()
1482 elif isinstance(rna, MutableSeq):
1483 return rna.toseq().back_transcribe()
1484 else:
1485 return rna.replace('U','T').replace('u','t')
1486
1487 -def _translate_str(sequence, table, stop_symbol="*", to_stop=False,
1488 cds=False, pos_stop="X") :
1489 """Helper function to translate a nucleotide string (PRIVATE).
1490
1491 Arguments:
1492 - sequence - a string
1493 - table - a CodonTable object (NOT a table name or id number)
1494 - stop_symbol - a single character string, what to use for terminators.
1495 - to_stop - boolean, should translation terminate at the first
1496 in frame stop codon? If there is no in-frame stop codon
1497 then translation continues to the end.
1498 - pos_stop - a single character string for a possible stop codon
1499 (e.g. TAN or NNN)
1500 - cds - Boolean, indicates this is a complete CDS. If True, this
1501 checks the sequence starts with a valid alternative start
1502 codon (which will be translated as methionine, M), that the
1503 sequence length is a multiple of three, and that there is a
1504 single in frame stop codon at the end (this will be excluded
1505 from the protein sequence, regardless of the to_stop option).
1506 If these tests fail, an exception is raised.
1507
1508 Returns a string.
1509
1510 e.g.
1511
1512 >>> from Bio.Data import CodonTable
1513 >>> table = CodonTable.ambiguous_dna_by_id[1]
1514 >>> _translate_str("AAA", table)
1515 'K'
1516 >>> _translate_str("TAR", table)
1517 '*'
1518 >>> _translate_str("TAN", table)
1519 'X'
1520 >>> _translate_str("TAN", table, pos_stop="@")
1521 '@'
1522 >>> _translate_str("TA?", table)
1523 Traceback (most recent call last):
1524 ...
1525 TranslationError: Codon 'TA?' is invalid
1526 >>> _translate_str("ATGCCCTAG", table, cds=True)
1527 'MP'
1528 >>> _translate_str("AAACCCTAG", table, cds=True)
1529 Traceback (most recent call last):
1530 ...
1531 TranslationError: First codon 'AAA' is not a start codon
1532 >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True)
1533 Traceback (most recent call last):
1534 ...
1535 TranslationError: Extra in frame stop codon found.
1536 """
1537 sequence = sequence.upper()
1538 amino_acids = []
1539 forward_table = table.forward_table
1540 stop_codons = table.stop_codons
1541 if table.nucleotide_alphabet.letters is not None :
1542 valid_letters = set(table.nucleotide_alphabet.letters.upper())
1543 else :
1544
1545 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + \
1546 IUPAC.ambiguous_rna.letters.upper())
1547 if cds :
1548 if str(sequence[:3]).upper() not in table.start_codons :
1549 raise CodonTable.TranslationError(\
1550 "First codon '%s' is not a start codon" % sequence[:3])
1551 if len(sequence) % 3 != 0 :
1552 raise CodonTable.TranslationError(\
1553 "Sequence length %i is not a multiple of three" % len(sequence))
1554 if str(sequence[-3:]).upper() not in stop_codons :
1555 raise CodonTable.TranslationError(\
1556 "Final codon '%s' is not a stop codon" % sequence[-3:])
1557
1558 sequence = sequence[3:-3]
1559 amino_acids = ["M"]
1560 n = len(sequence)
1561 for i in xrange(0,n-n%3,3) :
1562 codon = sequence[i:i+3]
1563 try :
1564 amino_acids.append(forward_table[codon])
1565 except (KeyError, CodonTable.TranslationError) :
1566
1567 if codon in table.stop_codons :
1568 if cds :
1569 raise CodonTable.TranslationError(\
1570 "Extra in frame stop codon found.")
1571 if to_stop : break
1572 amino_acids.append(stop_symbol)
1573 elif valid_letters.issuperset(set(codon)) :
1574
1575 amino_acids.append(pos_stop)
1576 else :
1577 raise CodonTable.TranslationError(\
1578 "Codon '%s' is invalid" % codon)
1579 return "".join(amino_acids)
1580
def translate(sequence, table="Standard", stop_symbol="*", to_stop=False,
              cds=False):
    """Translate a nucleotide sequence into amino acids.

    If given a string, returns a new string object.  Given a Seq or
    MutableSeq, returns a Seq object with a protein alphabet.

    Arguments:
     - table       - Which codon table to use?  This can be either a name
                     (string) or an NCBI identifier (integer).  Defaults
                     to the "Standard" table.
     - stop_symbol - Single character string, what to use for any
                     terminators, defaults to the asterisk, "*".
     - to_stop     - Boolean, defaults to False meaning do a full
                     translation continuing on past any stop codons
                     (translated as the specified stop_symbol).  If
                     True, translation is terminated at the first in
                     frame stop codon (and the stop_symbol is not
                     appended to the returned protein sequence).
     - cds         - Boolean, indicates this is a complete CDS.  If True,
                     this checks the sequence starts with a valid
                     alternative start codon (which will be translated as
                     methionine, M), that the sequence length is a
                     multiple of three, and that there is a single in
                     frame stop codon at the end (this will be excluded
                     from the protein sequence, regardless of the to_stop
                     option).  If these tests fail, an exception is raised.

    A simple string example using the default (standard) genetic code:

    >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
    >>> translate(coding_dna)
    'VAIVMGR*KGAR*'
    >>> translate(coding_dna, stop_symbol="@")
    'VAIVMGR@KGAR@'
    >>> translate(coding_dna, to_stop=True)
    'VAIVMGR'

    Now using NCBI table 2, where TGA is not a stop codon:

    >>> translate(coding_dna, table=2)
    'VAIVMGRWKGAR*'
    >>> translate(coding_dna, table=2, to_stop=True)
    'VAIVMGRWKGAR'

    In fact this example uses an alternative start codon valid under NCBI
    table 2, GTG, which means this example is a complete valid CDS which
    when translated should really start with methionine (not valine):

    >>> translate(coding_dna, table=2, cds=True)
    'MAIVMGRWKGAR'

    Note that if the sequence has no in-frame stop codon, then the to_stop
    argument has no effect:

    >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
    >>> translate(coding_dna2)
    'VAIVMGR'
    >>> translate(coding_dna2, to_stop=True)
    'VAIVMGR'

    NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
    or a stop codon.  These are translated as "X".  Any invalid codon
    (e.g. "TA?" or "T-A") will throw a TranslationError.

    NOTE - Does NOT support gapped sequences.

    It will however translate either DNA or RNA.
    """
    # Seq objects translate themselves.
    if isinstance(sequence, Seq):
        return sequence.translate(table, stop_symbol, to_stop, cds)

    # A MutableSeq is first converted to a Seq, which then does the work.
    if isinstance(sequence, MutableSeq):
        as_seq = sequence.toseq()
        return as_seq.translate(table, stop_symbol, to_stop, cds)

    # Anything else is handled as a plain string.  The table argument is
    # tried as an integer id first; a ValueError from that attempt means
    # it is looked up by name instead.
    try:
        codon_table = CodonTable.ambiguous_generic_by_id[int(table)]
    except ValueError:
        codon_table = CodonTable.ambiguous_generic_by_name[table]
    return _translate_str(sequence, codon_table, stop_symbol, to_stop, cds)
1661
1695
1697 """Run the Bio.Seq module's doctests."""
1698 print "Runing doctests..."
1699 import doctest
1700 doctest.testmod()
1701 print "Done"
1702
# When this file is executed as a script, run the module self-test.
if __name__ == "__main__":
    _test()
1705