1
2
3
4
5
6
7
8 """Provides objects to represent biological sequences with alphabets.
9
10 See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial:
11 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
12 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
13 """
14 __docformat__ ="epytext en"
15
16 import string
17 import array
18 import sys
19
20
21 try:
22 set = set
23 except NameError:
24 from sets import Set as set
25
26 import Alphabet
27 from Alphabet import IUPAC
28 from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement
29 from Bio.Data import CodonTable
30
32 """Makes a python string translation table (PRIVATE).
33
34 Arguments:
35 - complement_mapping - a dictionary such as ambiguous_dna_complement
36 and ambiguous_rna_complement from Data.IUPACData.
37
38 Returns a translation table (a string of length 256) for use with the
39 python string's translate method to use in a (reverse) complement.
40
41 Compatible with lower case and upper case sequences.
42
43 For internal use only.
44 """
45 before = ''.join(complement_mapping.keys())
46 after = ''.join(complement_mapping.values())
47 before = before + before.lower()
48 after = after + after.lower()
49 return string.maketrans(before, after)
50
51 _dna_complement_table = _maketrans(ambiguous_dna_complement)
52 _rna_complement_table = _maketrans(ambiguous_rna_complement)
53
55 """A read-only sequence object (essentially a string with an alphabet).
56
57 Like normal python strings, our basic sequence object is immutable.
58 This prevents you from doing my_seq[5] = "A" for example, but does allow
59 Seq objects to be used as dictionary keys.
60
61 The Seq object provides a number of string like methods (such as count,
62 find, split and strip), which are alphabet aware where appropriate.
63
64 The Seq object also provides some biological methods, such as complement,
65 reverse_complement, transcribe, back_transcribe and translate (which are
66 not applicable to sequences with a protein alphabet).
67 """
69 """Create a Seq object.
70
71 Arguments:
72 - seq - Sequence, required (string)
73 - alphabet - Optional argument, an Alphabet object from Bio.Alphabet
74
75 You will typically use Bio.SeqIO to read in sequences from files as
76 SeqRecord objects, whose sequence will be exposed as a Seq object via
77 the seq property.
78
79 However, will often want to create your own Seq objects directly:
80
81 >>> from Bio.Seq import Seq
82 >>> from Bio.Alphabet import IUPAC
83 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
84 ... IUPAC.protein)
85 >>> my_seq
86 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
87 >>> print my_seq
88 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
89 """
90
91 assert (type(data) == type("") or
92 type(data) == type(u""))
93 self._data = data
94 self.alphabet = alphabet
95
96
98
99
100 import warnings
101 warnings.warn("Writing to the Seq object's .data propery is deprecated.",
102 DeprecationWarning)
103 self._data = value
104 data = property(fget= lambda self : str(self),
105 fset=_set_data,
106 doc="Sequence as a string (DEPRECATED)")
107
109 """Returns a (truncated) representation of the sequence for debugging."""
110 if len(self) > 60 :
111
112
113
114 return "%s('%s...%s', %s)" % (self.__class__.__name__,
115 str(self)[:54], str(self)[-3:],
116 repr(self.alphabet))
117 else :
118 return "%s(%s, %s)" % (self.__class__.__name__,
119 repr(self.data),
120 repr(self.alphabet))
122 """Returns the full sequence as a python string.
123
124 Note that Biopython 1.44 and earlier would give a truncated
125 version of repr(my_seq) for str(my_seq). If you are writing code
126 which need to be backwards compatible with old Biopython, you
127 should continue to use my_seq.tostring() rather than str(my_seq).
128 """
129 return self._data
130
131 """
132 TODO - Work out why this breaks test_Restriction.py
133 (Comparing Seq objects would be nice to have. May need to think about
134 hashes and the in operator for when have list/dictionary of Seq objects...)
135 def __cmp__(self, other):
136 if hasattr(other, "alphabet") :
137 #other should be a Seq or a MutableSeq
138 if not Alphabet._check_type_compatible([self.alphabet,
139 other.alphabet]) :
140 raise TypeError("Incompatable alphabets %s and %s" \
141 % (repr(self.alphabet), repr(other.alphabet)))
142 #They should be the same sequence type (or one of them is generic)
143 return cmp(str(self), str(other))
144 elif isinstance(other, basestring) :
145 return cmp(str(self), other)
146 else :
147 raise TypeError
148 """
149
150 - def __len__(self): return len(self._data)
151
153
154
155
156 if isinstance(index, int) :
157
158 return self._data[index]
159 else :
160
161 return Seq(self._data[index], self.alphabet)
162
164 """Add another sequence or string to this sequence."""
165 if hasattr(other, "alphabet") :
166
167 if not Alphabet._check_type_compatible([self.alphabet,
168 other.alphabet]) :
169 raise TypeError("Incompatable alphabets %s and %s" \
170 % (repr(self.alphabet), repr(other.alphabet)))
171
172 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
173 return self.__class__(str(self) + str(other), a)
174 elif isinstance(other, basestring) :
175
176 return self.__class__(str(self) + other, self.alphabet)
177 else :
178 raise TypeError
179
181 if hasattr(other, "alphabet") :
182
183 if not Alphabet._check_type_compatible([self.alphabet,
184 other.alphabet]) :
185 raise TypeError("Incompatable alphabets %s and %s" \
186 % (repr(self.alphabet), repr(other.alphabet)))
187
188 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
189 return self.__class__(str(other) + str(self), a)
190 elif isinstance(other, basestring) :
191
192 return self.__class__(other + str(self), self.alphabet)
193 else :
194 raise TypeError
195
197 """Returns the full sequence as a python string.
198
199 Although not formally deprecated, you are now encouraged to use
200 str(my_seq) instead of my_seq.tostring()."""
201 return str(self)
202
204 """Returns the full sequence as a MutableSeq object.
205
206 >>> from Bio.Seq import Seq
207 >>> from Bio.Alphabet import IUPAC
208 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL",
209 ... IUPAC.protein)
210 >>> my_seq
211 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
212 >>> my_seq.tomutable()
213 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
214
215 Note that the alphabet is preserved.
216 """
217 return MutableSeq(str(self), self.alphabet)
218
220 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE).
221
222 For a string argument, returns the string.
223
224 For a Seq or MutableSeq, it checks the alphabet is compatible
225 (raising an exception if it isn't), and then returns a string.
226 """
227 try :
228 other_alpha = other_sequence.alphabet
229 except AttributeError :
230
231 return other_sequence
232
233
234 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]) :
235 raise TypeError("Incompatable alphabets %s and %s" \
236 % (repr(self.alphabet), repr(other_alpha)))
237
238 return str(other_sequence)
239
240 - def count(self, sub, start=0, end=sys.maxint):
241 """Non-overlapping count method, like that of a python string.
242
243 This behaves like the python string method of the same name,
244 which does a non-overlapping count!
245
246 Returns an integer, the number of occurrences of substring
247 argument sub in the (sub)sequence given by [start:end].
248 Optional arguments start and end are interpreted as in slice
249 notation.
250
251 Arguments:
252 - sub - a string or another Seq object to look for
253 - start - optional integer, slice start
254 - end - optional integer, slice end
255
256 e.g.
257
258 >>> from Bio.Seq import Seq
259 >>> my_seq = Seq("AAAATGA")
260 >>> print my_seq.count("A")
261 5
262 >>> print my_seq.count("ATG")
263 1
264 >>> print my_seq.count(Seq("AT"))
265 1
266 >>> print my_seq.count("AT", 2, -1)
267 1
268
269 HOWEVER, please note because python strings and Seq objects (and
270 MutableSeq objects) do a non-overlapping search, this may not give
271 the answer you expect:
272
273 >>> "AAAA".count("AA")
274 2
275 >>> print Seq("AAAA").count("AA")
276 2
277
278 A non-overlapping search would give the answer as three!
279 """
280
281 sub_str = self._get_seq_str_and_check_alphabet(sub)
282 return str(self).count(sub_str, start, end)
283
284 - def find(self, sub, start=0, end=sys.maxint):
285 """Find method, like that of a python string.
286
287 This behaves like the python string method of the same name.
288
289 Returns an integer, the index of the first occurrence of substring
290 argument sub in the (sub)sequence given by [start:end].
291
292 Arguments:
293 - sub - a string or another Seq object to look for
294 - start - optional integer, slice start
295 - end - optional integer, slice end
296
297 Returns -1 if the subsequence is NOT found.
298
299 e.g. Locating the first typical start codon, AUG, in an RNA sequence:
300
301 >>> from Bio.Seq import Seq
302 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
303 >>> my_rna.find("AUG")
304 3
305 """
306
307 sub_str = self._get_seq_str_and_check_alphabet(sub)
308 return str(self).find(sub_str, start, end)
309
310 - def rfind(self, sub, start=0, end=sys.maxint):
311 """Find from right method, like that of a python string.
312
313 This behaves like the python string method of the same name.
314
315 Returns an integer, the index of the last (right most) occurrence of
316 substring argument sub in the (sub)sequence given by [start:end].
317
318 Arguments:
319 - sub - a string or another Seq object to look for
320 - start - optional integer, slice start
321 - end - optional integer, slice end
322
323 Returns -1 if the subsequence is NOT found.
324
325 e.g. Locating the last typical start codon, AUG, in an RNA sequence:
326
327 >>> from Bio.Seq import Seq
328 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
329 >>> my_rna.rfind("AUG")
330 15
331 """
332
333 sub_str = self._get_seq_str_and_check_alphabet(sub)
334 return str(self).rfind(sub_str, start, end)
335
336 - def startswith(self, prefix, start=0, end=sys.maxint) :
337 """Does the Seq start with the given prefix? Returns True/False.
338
339 This behaves like the python string method of the same name.
340
341 Return True if the sequence starts with the specified prefix
342 (a string or another Seq object), False otherwise.
343 With optional start, test sequence beginning at that position.
344 With optional end, stop comparing sequence at that position.
345 prefix can also be a tuple of strings to try. e.g.
346
347 >>> from Bio.Seq import Seq
348 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
349 >>> my_rna.startswith("GUC")
350 True
351 >>> my_rna.startswith("AUG")
352 False
353 >>> my_rna.startswith("AUG", 3)
354 True
355 >>> my_rna.startswith(("UCC","UCA","UCG"),1)
356 True
357 """
358
359 if isinstance(prefix, tuple) :
360
361
362
363 prefix_strings = [self._get_seq_str_and_check_alphabet(p) \
364 for p in prefix]
365 for prefix_str in prefix_strings :
366 if str(self).startswith(prefix_str, start, end) :
367 return True
368 return False
369 else :
370 prefix_str = self._get_seq_str_and_check_alphabet(prefix)
371 return str(self).startswith(prefix_str, start, end)
372
373 - def endswith(self, suffix, start=0, end=sys.maxint) :
374 """Does the Seq end with the given suffix? Returns True/False.
375
376 This behaves like the python string method of the same name.
377
378 Return True if the sequence ends with the specified suffix
379 (a string or another Seq object), False otherwise.
380 With optional start, test sequence beginning at that position.
381 With optional end, stop comparing sequence at that position.
382 suffix can also be a tuple of strings to try. e.g.
383
384 >>> from Bio.Seq import Seq
385 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
386 >>> my_rna.endswith("UUG")
387 True
388 >>> my_rna.endswith("AUG")
389 False
390 >>> my_rna.endswith("AUG", 0, 18)
391 True
392 >>> my_rna.endswith(("UCC","UCA","UUG"))
393 True
394 """
395
396 if isinstance(suffix, tuple) :
397
398
399
400 suffix_strings = [self._get_seq_str_and_check_alphabet(p) \
401 for p in suffix]
402 for suffix_str in suffix_strings :
403 if str(self).endswith(suffix_str, start, end) :
404 return True
405 return False
406 else :
407 suffix_str = self._get_seq_str_and_check_alphabet(suffix)
408 return str(self).endswith(suffix_str, start, end)
409
410
411 - def split(self, sep=None, maxsplit=-1) :
412 """Split method, like that of a python string.
413
414 This behaves like the python string method of the same name.
415
416 Return a list of the 'words' in the string (as Seq objects),
417 using sep as the delimiter string. If maxsplit is given, at
418 most maxsplit splits are done. If maxsplit is ommited, all
419 splits are made.
420
421 Following the python string method, sep will by default be any
422 white space (tabs, spaces, newlines) but this is unlikely to
423 apply to biological sequences.
424
425 e.g.
426
427 >>> from Bio.Seq import Seq
428 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
429 >>> my_aa = my_rna.translate()
430 >>> my_aa
431 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))
432 >>> my_aa.split("*")
433 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
434 >>> my_aa.split("*",1)
435 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
436
437 See also the rsplit method:
438
439 >>> my_aa.rsplit("*",1)
440 [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
441 """
442
443 sep_str = self._get_seq_str_and_check_alphabet(sep)
444
445
446 return [Seq(part, self.alphabet) \
447 for part in str(self).split(sep_str, maxsplit)]
448
449 - def rsplit(self, sep=None, maxsplit=-1) :
450 """Right split method, like that of a python string.
451
452 This behaves like the python string method of the same name.
453
454 Return a list of the 'words' in the string (as Seq objects),
455 using sep as the delimiter string. If maxsplit is given, at
456 most maxsplit splits are done COUNTING FROM THE RIGHT.
457 If maxsplit is ommited, all splits are made.
458
459 Following the python string method, sep will by default be any
460 white space (tabs, spaces, newlines) but this is unlikely to
461 apply to biological sequences.
462
463 e.g. print my_seq.rsplit("*",1)
464
465 See also the split method.
466 """
467
468 sep_str = self._get_seq_str_and_check_alphabet(sep)
469 try :
470 return [Seq(part, self.alphabet) \
471 for part in str(self).rsplit(sep_str, maxsplit)]
472 except AttributeError :
473
474
475
476 words = [Seq(word[::-1], self.alphabet) for word \
477 in str(self)[::-1].split(sep_str[::-1], maxsplit)]
478 words.reverse()
479 return words
480
481 - def strip(self, chars=None) :
482 """Returns a new Seq object with leading and trailing ends stripped.
483
484 This behaves like the python string method of the same name.
485
486 Optional argument chars defines which characters to remove. If
487 ommitted or None (default) then as for the python string method,
488 this defaults to removing any white space.
489
490 e.g. print my_seq.strip("-")
491
492 See also the lstrip and rstrip methods.
493 """
494
495 strip_str = self._get_seq_str_and_check_alphabet(chars)
496 return Seq(str(self).strip(strip_str), self.alphabet)
497
498 - def lstrip(self, chars=None) :
499 """Returns a new Seq object with leading (left) end stripped.
500
501 This behaves like the python string method of the same name.
502
503 Optional argument chars defines which characters to remove. If
504 ommitted or None (default) then as for the python string method,
505 this defaults to removing any white space.
506
507 e.g. print my_seq.lstrip("-")
508
509 See also the strip and rstrip methods.
510 """
511
512 strip_str = self._get_seq_str_and_check_alphabet(chars)
513 return Seq(str(self).lstrip(strip_str), self.alphabet)
514
515 - def rstrip(self, chars=None) :
516 """Returns a new Seq object with trailing (right) end stripped.
517
518 This behaves like the python string method of the same name.
519
520 Optional argument chars defines which characters to remove. If
521 ommitted or None (default) then as for the python string method,
522 this defaults to removing any white space.
523
524 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail):
525
526 >>> from Bio.Alphabet import IUPAC
527 >>> from Bio.Seq import Seq
528 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna)
529 >>> my_seq
530 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA())
531 >>> my_seq.rstrip("A")
532 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA())
533
534 See also the strip and lstrip methods.
535 """
536
537 strip_str = self._get_seq_str_and_check_alphabet(chars)
538 return Seq(str(self).rstrip(strip_str), self.alphabet)
539
541 """Returns the complement sequence. New Seq object.
542
543 >>> from Bio.Seq import Seq
544 >>> from Bio.Alphabet import IUPAC
545 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna)
546 >>> my_dna
547 Seq('CCCCCGATAG', IUPACUnambiguousDNA())
548 >>> my_dna.complement()
549 Seq('GGGGGCTATC', IUPACUnambiguousDNA())
550
551 You can of course used mixed case sequences,
552
553 >>> from Bio.Seq import Seq
554 >>> from Bio.Alphabet import generic_dna
555 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna)
556 >>> my_dna
557 Seq('CCCCCgatA-GD', DNAAlphabet())
558 >>> my_dna.complement()
559 Seq('GGGGGctaT-CH', DNAAlphabet())
560
561 Note in the above example, ambiguous character D denotes
562 G, A or T so its complement is H (for C, T or A).
563
564 Trying to complement a protein sequence raises an exception.
565
566 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
567 >>> my_protein.complement()
568 Traceback (most recent call last):
569 ...
570 ValueError: Proteins do not have complements!
571 """
572 base = Alphabet._get_base_alphabet(self.alphabet)
573 if isinstance(base, Alphabet.ProteinAlphabet) :
574 raise ValueError("Proteins do not have complements!")
575 if isinstance(base, Alphabet.DNAAlphabet) :
576 ttable = _dna_complement_table
577 elif isinstance(base, Alphabet.RNAAlphabet) :
578 ttable = _rna_complement_table
579 elif ('U' in self._data or 'u' in self._data) \
580 and ('T' in self._data or 't' in self._data):
581
582 raise ValueError("Mixed RNA/DNA found")
583 elif 'U' in self._data or 'u' in self._data:
584 ttable = _rna_complement_table
585 else:
586 ttable = _dna_complement_table
587
588
589 return Seq(str(self).translate(ttable), self.alphabet)
590
592 """Returns the reverse complement sequence. New Seq object.
593
594 >>> from Bio.Seq import Seq
595 >>> from Bio.Alphabet import IUPAC
596 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna)
597 >>> my_dna
598 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA())
599 >>> my_dna.reverse_complement()
600 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA())
601
602 Note in the above example, since R = G or A, its complement
603 is Y (which denotes C or T).
604
605 You can of course used mixed case sequences,
606
607 >>> from Bio.Seq import Seq
608 >>> from Bio.Alphabet import generic_dna
609 >>> my_dna = Seq("CCCCCgatA-G", generic_dna)
610 >>> my_dna
611 Seq('CCCCCgatA-G', DNAAlphabet())
612 >>> my_dna.reverse_complement()
613 Seq('C-TatcGGGGG', DNAAlphabet())
614
615 Trying to complement a protein sequence raises an exception:
616
617 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
618 >>> my_protein.reverse_complement()
619 Traceback (most recent call last):
620 ...
621 ValueError: Proteins do not have complements!
622 """
623
624 return self.complement()[::-1]
625
627 """Returns the RNA sequence from a DNA sequence. New Seq object.
628
629 >>> from Bio.Seq import Seq
630 >>> from Bio.Alphabet import IUPAC
631 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG",
632 ... IUPAC.unambiguous_dna)
633 >>> coding_dna
634 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
635 >>> coding_dna.transcribe()
636 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
637
638 Trying to transcribe a protein or RNA sequence raises an exception:
639
640 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
641 >>> my_protein.transcribe()
642 Traceback (most recent call last):
643 ...
644 ValueError: Proteins cannot be transcribed!
645 """
646 base = Alphabet._get_base_alphabet(self.alphabet)
647 if isinstance(base, Alphabet.ProteinAlphabet) :
648 raise ValueError("Proteins cannot be transcribed!")
649 if isinstance(base, Alphabet.RNAAlphabet) :
650 raise ValueError("RNA cannot be transcribed!")
651
652 if self.alphabet==IUPAC.unambiguous_dna:
653 alphabet = IUPAC.unambiguous_rna
654 elif self.alphabet==IUPAC.ambiguous_dna:
655 alphabet = IUPAC.ambiguous_rna
656 else:
657 alphabet = Alphabet.generic_rna
658 return Seq(str(self).replace('T','U').replace('t','u'), alphabet)
659
661 """Returns the DNA sequence from an RNA sequence. New Seq object.
662
663 >>> from Bio.Seq import Seq
664 >>> from Bio.Alphabet import IUPAC
665 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG",
666 ... IUPAC.unambiguous_rna)
667 >>> messenger_rna
668 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
669 >>> messenger_rna.back_transcribe()
670 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
671
672 Trying to back-transcribe a protein or DNA sequence raises an
673 exception:
674
675 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
676 >>> my_protein.back_transcribe()
677 Traceback (most recent call last):
678 ...
679 ValueError: Proteins cannot be back transcribed!
680 """
681 base = Alphabet._get_base_alphabet(self.alphabet)
682 if isinstance(base, Alphabet.ProteinAlphabet) :
683 raise ValueError("Proteins cannot be back transcribed!")
684 if isinstance(base, Alphabet.DNAAlphabet) :
685 raise ValueError("DNA cannot be back transcribed!")
686
687 if self.alphabet==IUPAC.unambiguous_rna:
688 alphabet = IUPAC.unambiguous_dna
689 elif self.alphabet==IUPAC.ambiguous_rna:
690 alphabet = IUPAC.ambiguous_dna
691 else:
692 alphabet = Alphabet.generic_dna
693 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
694
695 - def translate(self, table="Standard", stop_symbol="*", to_stop=False):
696 """Turns a nucleotide sequence into a protein sequence. New Seq object.
697
698 This method will translate DNA or RNA sequences, and those with a
699 nucleotide or generic alphabet. Trying to translate a protein
700 sequence raises an exception.
701
702 Arguments:
703 - table - Which codon table to use? This can be either a name
704 (string) or an NCBI identifier (integer). This defaults
705 to the "Standard" table.
706 - stop_symbol - Single character string, what to use for terminators.
707 This defaults to the asterisk, "*".
708 - to_stop - Boolean, defaults to False meaning do a full translation
709 continuing on past any stop codons (translated as the
710 specified stop_symbol). If True, translation is
711 terminated at the first in frame stop codon (and the
712 stop_symbol is not appended to the returned protein
713 sequence).
714
715 e.g. Using the standard table:
716
717 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
718 >>> coding_dna.translate()
719 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
720 >>> coding_dna.translate(stop_symbol="@")
721 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@'))
722 >>> coding_dna.translate(to_stop=True)
723 Seq('VAIVMGR', ExtendedIUPACProtein())
724
725 Now using NCBI table 2, where TGA is not a stop codon:
726
727 >>> coding_dna.translate(table=2)
728 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
729 >>> coding_dna.translate(table=2, to_stop=True)
730 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein())
731
732 If the sequence has no in-frame stop codon, then the to_stop argument
733 has no effect:
734
735 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
736 >>> coding_dna2.translate()
737 Seq('LAIVMGR', ExtendedIUPACProtein())
738 >>> coding_dna2.translate(to_stop=True)
739 Seq('LAIVMGR', ExtendedIUPACProtein())
740
741 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
742 or a stop codon. These are translated as "X". Any invalid codon
743 (e.g. "TA?" or "T-A") will throw a TranslationError.
744
745 NOTE - Does NOT support gapped sequences.
746
747 NOTE - This does NOT behave like the python string's translate
748 method. For that use str(my_seq).translate(...) instead.
749 """
750 try:
751 table_id = int(table)
752 except ValueError:
753 table_id = None
754 if isinstance(table, str) and len(table)==256 :
755 raise ValueError("The Seq object translate method DOES NOT take " \
756 + "a 256 character string mapping table like " \
757 + "the python string object's translate method. " \
758 + "Use str(my_seq).translate(...) instead.")
759 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
760 Alphabet.ProteinAlphabet) :
761 raise ValueError("Proteins cannot be translated!")
762 if self.alphabet==IUPAC.unambiguous_dna:
763
764 if table_id is None:
765 codon_table = CodonTable.unambiguous_dna_by_name[table]
766 else:
767 codon_table = CodonTable.unambiguous_dna_by_id[table_id]
768
769
770
771
772
773 elif self.alphabet==IUPAC.unambiguous_rna:
774
775 if table_id is None:
776 codon_table = CodonTable.unambiguous_rna_by_name[table]
777 else:
778 codon_table = CodonTable.unambiguous_rna_by_id[table_id]
779
780
781
782
783
784 else:
785
786
787
788 if table_id is None:
789 codon_table = CodonTable.ambiguous_generic_by_name[table]
790 else:
791 codon_table = CodonTable.ambiguous_generic_by_id[table_id]
792 protein = _translate_str(str(self), codon_table, stop_symbol, to_stop)
793 if stop_symbol in protein :
794 alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet,
795 stop_symbol = stop_symbol)
796 else :
797 alphabet = codon_table.protein_alphabet
798 return Seq(protein, alphabet)
799
801 """A read-only sequence object of known length but unknown contents.
802
803 If you have an unknown sequence, you can represent this with a normal
804 Seq object, for example:
805
806 >>> my_seq = Seq("N"*5)
807 >>> my_seq
808 Seq('NNNNN', Alphabet())
809 >>> len(my_seq)
810 5
811 >>> print my_seq
812 NNNNN
813
814 However, this is rather wasteful of memory (especially for large
815 sequences), which is where this class is most usefull:
816
817 >>> unk_five = UnknownSeq(5)
818 >>> unk_five
819 UnknownSeq(5, alphabet = Alphabet(), character = '?')
820 >>> len(unk_five)
821 5
822 >>> print(unk_five)
823 ?????
824
825 You can add unknown sequence together, provided their alphabets and
826 characters are compatible, and get another memory saving UnknownSeq:
827
828 >>> unk_four = UnknownSeq(4)
829 >>> unk_four
830 UnknownSeq(4, alphabet = Alphabet(), character = '?')
831 >>> unk_four + unk_five
832 UnknownSeq(9, alphabet = Alphabet(), character = '?')
833
834 If the alphabet or characters don't match up, the addition gives an
835 ordinary Seq object:
836
837 >>> unk_nnnn = UnknownSeq(4, character = "N")
838 >>> unk_nnnn
839 UnknownSeq(4, alphabet = Alphabet(), character = 'N')
840 >>> unk_nnnn + unk_four
841 Seq('NNNN????', Alphabet())
842
843 Combining with a real Seq gives a new Seq object:
844
845 >>> known_seq = Seq("ACGT")
846 >>> unk_four + known_seq
847 Seq('????ACGT', Alphabet())
848 >>> known_seq + unk_four
849 Seq('ACGT????', Alphabet())
850 """
852 """Create a new UnknownSeq object.
853
854 If character is ommited, it is determed from the alphabet, "N" for
855 nucleotides, "X" for proteins, and "?" otherwise.
856 """
857 self._length = int(length)
858 if self._length < 0 :
859
860 raise ValueError("Length must not be negative.")
861 self.alphabet = alphabet
862 if character :
863 if len(character) != 1 :
864 raise ValueError("character argument should be a single letter string.")
865 self._character = character
866 else :
867 base = Alphabet._get_base_alphabet(alphabet)
868
869
870 if isinstance(base, Alphabet.NucleotideAlphabet) :
871 self._character = "N"
872 elif isinstance(base, Alphabet.ProteinAlphabet) :
873 self._character = "X"
874 else :
875 self._character = "?"
876
878 """Returns the stated length of the unknown sequence."""
879 return self._length
880
882 """Returns the unknown sequence as full string of the given length."""
883 return self._character * self._length
884
886 return "UnknownSeq(%i, alphabet = %s, character = %s)" \
887 % (self._length, repr(self.alphabet), repr(self._character))
888
890 if isinstance(other, UnknownSeq) \
891 and other._character == self._character :
892
893 return UnknownSeq(len(self)+len(other),
894 self.alphabet, self._character)
895
896 return Seq(str(self), self.alphabet) + other
897
899 if isinstance(other, UnknownSeq) \
900 and other._character == self._character :
901
902 return UnknownSeq(len(self)+len(other),
903 self.alphabet, self._character)
904
905 return other + Seq(str(self), self.alphabet)
906
915
916 - def count(self, sub, start=0, end=sys.maxint):
917 """Non-overlapping count method, like that of a python string.
918
919 This behaves like the python string (and Seq object) method of the
920 same name, which does a non-overlapping count!
921
922 Returns an integer, the number of occurrences of substring
923 argument sub in the (sub)sequence given by [start:end].
924 Optional arguments start and end are interpreted as in slice
925 notation.
926
927 Arguments:
928 - sub - a string or another Seq object to look for
929 - start - optional integer, slice start
930 - end - optional integer, slice end
931
932 >>> "NNNN".count("N")
933 4
934 >>> Seq("NNNN").count("N")
935 4
936 >>> UnknownSeq(4, character="N").count("N")
937 4
938 >>> UnknownSeq(4, character="N").count("A")
939 0
940 >>> UnknownSeq(4, character="N").count("AA")
941 0
942
943 HOWEVER, please note because that python strings and Seq objects (and
944 MutableSeq objects) do a non-overlapping search, this may not give
945 the answer you expect:
946
947 >>> UnknownSeq(4, character="N").count("NN")
948 2
949 >>> UnknownSeq(4, character="N").count("NNN")
950 1
951 """
952 sub_str = self._get_seq_str_and_check_alphabet(sub)
953 if len(sub_str) == 1 :
954 if str(sub_str) == self._character :
955 if start==0 and end >= self._length :
956 return self._length
957 else :
958
959 return str(self).count(sub_str, start, end)
960 else :
961 return 0
962 else :
963 if set(sub_str) == set(self._character) :
964 if start==0 and end >= self._length :
965 return self._length // len(sub_str)
966 else :
967
968 return str(self).count(sub_str, start, end)
969 else :
970 return 0
971
973 """The complement of an unknown nucleotide equals itself.
974
975 >>> my_nuc = UnknownSeq(8)
976 >>> my_nuc
977 UnknownSeq(8, alphabet = Alphabet(), character = '?')
978 >>> print my_nuc
979 ????????
980 >>> my_nuc.complement()
981 UnknownSeq(8, alphabet = Alphabet(), character = '?')
982 >>> print my_nuc.complement()
983 ????????
984 """
985 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
986 Alphabet.ProteinAlphabet) :
987 raise ValueError("Proteins do not have complements!")
988 return self
989
991 """The reverse complement of an unknown nucleotide equals itself.
992
993 >>> my_nuc = UnknownSeq(10)
994 >>> my_nuc
995 UnknownSeq(10, alphabet = Alphabet(), character = '?')
996 >>> print my_nuc
997 ??????????
998 >>> my_nuc.reverse_complement()
999 UnknownSeq(10, alphabet = Alphabet(), character = '?')
1000 >>> print my_nuc.reverse_complement()
1001 ??????????
1002 """
1003 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1004 Alphabet.ProteinAlphabet) :
1005 raise ValueError("Proteins do not have complements!")
1006 return self
1007
1009 """Returns unknown RNA sequence from an unknown DNA sequence.
1010
1011 >>> my_dna = UnknownSeq(10, character="N")
1012 >>> my_dna
1013 UnknownSeq(10, alphabet = Alphabet(), character = 'N')
1014 >>> print my_dna
1015 NNNNNNNNNN
1016 >>> my_rna = my_dna.transcribe()
1017 >>> my_rna
1018 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N')
1019 >>> print my_rna
1020 NNNNNNNNNN
1021 """
1022
1023 s = Seq(self._character, self.alphabet).transcribe()
1024 return UnknownSeq(self._length, s.alphabet, self._character)
1025
1027 """Returns unknown DNA sequence from an unknown RNA sequence.
1028
1029 >>> my_rna = UnknownSeq(20, character="N")
1030 >>> my_rna
1031 UnknownSeq(20, alphabet = Alphabet(), character = 'N')
1032 >>> print my_rna
1033 NNNNNNNNNNNNNNNNNNNN
1034 >>> my_dna = my_rna.back_transcribe()
1035 >>> my_dna
1036 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1037 >>> print my_dna
1038 NNNNNNNNNNNNNNNNNNNN
1039 """
1040
1041 s = Seq(self._character, self.alphabet).back_transcribe()
1042 return UnknownSeq(self._length, s.alphabet, self._character)
1043
1045 """Translate an unknown nucleotide sequence into an unknown protein.
1046
1047 e.g.
1048
1049 >>> my_seq = UnknownSeq(11, character="N")
1050 >>> print my_seq
1051 NNNNNNNNNNN
1052 >>> my_protein = my_seq.translate()
1053 >>> my_protein
1054 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X')
1055 >>> print my_protein
1056 XXX
1057
1058 In comparison, using a normal Seq object:
1059
1060 >>> my_seq = Seq("NNNNNNNNNNN")
1061 >>> print my_seq
1062 NNNNNNNNNNN
1063 >>> my_protein = my_seq.translate()
1064 >>> my_protein
1065 Seq('XXX', ExtendedIUPACProtein())
1066 >>> print my_protein
1067 XXX
1068
1069 """
1070 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1071 Alphabet.ProteinAlphabet) :
1072 raise ValueError("Proteins cannot be translated!")
1073 return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
1074
1075
1077 """An editable sequence object (with an alphabet).
1078
1079 Unlike normal python strings and our basic sequence object (the Seq class)
1080 which are immuatable, the MutableSeq lets you edit the sequence in place.
1081 However, this means you cannot use a MutableSeq object as a dictionary key.
1082
1083 >>> from Bio.Seq import MutableSeq
1084 >>> from Bio.Alphabet import generic_dna
1085 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna)
1086 >>> my_seq
1087 MutableSeq('ACTCGTCGTCG', DNAAlphabet())
1088 >>> my_seq[5]
1089 'T'
1090 >>> my_seq[5] = "A"
1091 >>> my_seq
1092 MutableSeq('ACTCGACGTCG', DNAAlphabet())
1093 >>> my_seq[5]
1094 'A'
1095 >>> my_seq[5:8] = "NNN"
1096 >>> my_seq
1097 MutableSeq('ACTCGNNNTCG', DNAAlphabet())
1098 >>> len(my_seq)
1099 11
1100
1101 Note that the MutableSeq object does not support as many string-like
1102 or biological methods as the Seq object.
1103 """
1110
1112 """Returns a (truncated) representation of the sequence for debugging."""
1113 if len(self) > 60 :
1114
1115
1116
1117 return "%s('%s...%s', %s)" % (self.__class__.__name__,
1118 str(self[:54]), str(self[-3:]),
1119 repr(self.alphabet))
1120 else :
1121 return "%s('%s', %s)" % (self.__class__.__name__,
1122 str(self),
1123 repr(self.alphabet))
1124
1126 """Returns the full sequence as a python string.
1127
1128 Note that Biopython 1.44 and earlier would give a truncated
1129 version of repr(my_seq) for str(my_seq). If you are writing code
1130 which needs to be backwards compatible with old Biopython, you
1131 should continue to use my_seq.tostring() rather than str(my_seq).
1132 """
1133
1134 return "".join(self.data)
1135
1137 """Compare the sequence for to another sequence or a string.
1138
1139 If compared to another sequence the alphabets must be compatible.
1140 Comparing DNA to RNA, or Nucleotide to Protein will raise an
1141 exception.
1142
1143 Otherwise only the sequence itself is compared, not the precise
1144 alphabet.
1145
1146 This method indirectly supports ==, < , etc."""
1147 if hasattr(other, "alphabet") :
1148
1149 if not Alphabet._check_type_compatible([self.alphabet,
1150 other.alphabet]) :
1151 raise TypeError("Incompatable alphabets %s and %s" \
1152 % (repr(self.alphabet), repr(other.alphabet)))
1153
1154 if isinstance(other, MutableSeq):
1155
1156
1157 return cmp(self.data, other.data)
1158 else :
1159 return cmp(str(self), str(other))
1160 elif isinstance(other, basestring) :
1161 return cmp(str(self), other)
1162 else :
1163 raise TypeError
1164
1166
1177
1193
1195
1196
1197
1198
1199
1200 del self.data[index]
1201
1203 """Add another sequence or string to this sequence.
1204
1205 Returns a new MutableSeq object."""
1206 if hasattr(other, "alphabet") :
1207
1208 if not Alphabet._check_type_compatible([self.alphabet,
1209 other.alphabet]) :
1210 raise TypeError("Incompatable alphabets %s and %s" \
1211 % (repr(self.alphabet), repr(other.alphabet)))
1212
1213 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
1214 if isinstance(other, MutableSeq):
1215
1216
1217 return self.__class__(self.data + other.data, a)
1218 else :
1219 return self.__class__(str(self) + str(other), a)
1220 elif isinstance(other, basestring) :
1221
1222 return self.__class__(str(self) + str(other), self.alphabet)
1223 else :
1224 raise TypeError
1225
1227 if hasattr(other, "alphabet") :
1228
1229 if not Alphabet._check_type_compatible([self.alphabet,
1230 other.alphabet]) :
1231 raise TypeError("Incompatable alphabets %s and %s" \
1232 % (repr(self.alphabet), repr(other.alphabet)))
1233
1234 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
1235 if isinstance(other, MutableSeq):
1236
1237
1238 return self.__class__(other.data + self.data, a)
1239 else :
1240 return self.__class__(str(other) + str(self), a)
1241 elif isinstance(other, basestring) :
1242
1243 return self.__class__(str(other) + str(self), self.alphabet)
1244 else :
1245 raise TypeError
1246
1249
1252
1253 - def pop(self, i = (-1)):
1254 c = self.data[i]
1255 del self.data[i]
1256 return c
1257
1259 for i in range(len(self.data)):
1260 if self.data[i] == item:
1261 del self.data[i]
1262 return
1263 raise ValueError("MutableSeq.remove(x): x not in list")
1264
1265 - def count(self, sub, start=0, end=sys.maxint):
1266 """Non-overlapping count method, like that of a python string.
1267
1268 This behaves like the python string method of the same name,
1269 which does a non-overlapping count!
1270
1271 Returns an integer, the number of occurrences of substring
1272 argument sub in the (sub)sequence given by [start:end].
1273 Optional arguments start and end are interpreted as in slice
1274 notation.
1275
1276 Arguments:
1277 - sub - a string or another Seq object to look for
1278 - start - optional integer, slice start
1279 - end - optional integer, slice end
1280
1281 e.g.
1282
1283 >>> from Bio.Seq import MutableSeq
1284 >>> my_mseq = MutableSeq("AAAATGA")
1285 >>> print my_mseq.count("A")
1286 5
1287 >>> print my_mseq.count("ATG")
1288 1
1289 >>> print my_mseq.count(Seq("AT"))
1290 1
1291 >>> print my_mseq.count("AT", 2, -1)
1292 1
1293
1294 HOWEVER, please note because that python strings, Seq objects and
1295 MutableSeq objects do a non-overlapping search, this may not give
1296 the answer you expect:
1297
1298 >>> "AAAA".count("AA")
1299 2
1300 >>> print MutableSeq("AAAA").count("AA")
1301 2
1302
1303 A non-overlapping search would give the answer as three!
1304 """
1305 try :
1306
1307 search = sub.tostring()
1308 except AttributeError :
1309 search = sub
1310
1311 if not isinstance(search, basestring) :
1312 raise TypeError("expected a string, Seq or MutableSeq")
1313
1314 if len(search) == 1 :
1315
1316 count = 0
1317 for c in self.data[start:end]:
1318 if c == search: count += 1
1319 return count
1320 else :
1321
1322 return self.tostring().count(search, start, end)
1323
1325 for i in range(len(self.data)):
1326 if self.data[i] == item:
1327 return i
1328 raise ValueError("MutableSeq.index(x): x not in list")
1329
1331 """Modify the mutable sequence to reverse itself.
1332
1333 No return value.
1334 """
1335 self.data.reverse()
1336
1362
1364 """Modify the mutable sequence to take on its reverse complement.
1365
1366 Trying to reverse complement a protein sequence raises an exception.
1367
1368 No return value.
1369 """
1370 self.complement()
1371 self.data.reverse()
1372
1373
1374
1375
1383
1385 """Returns the full sequence as a python string.
1386
1387 Although not formally deprecated, you are now encouraged to use
1388 str(my_seq) instead of my_seq.tostring().
1389
1390 Because str(my_seq) will give you the full sequence as a python string,
1391 there is often no need to make an explicit conversion. For example,
1392
1393 print "ID={%s}, sequence={%s}" % (my_name, my_seq)
1394
1395 On Biopython 1.44 or older you would have to have done this:
1396
1397 print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())
1398 """
1399 return "".join(self.data)
1400
1402 """Returns the full sequence as a new immutable Seq object.
1403
1404 >>> from Bio.Seq import Seq
1405 >>> from Bio.Alphabet import IUPAC
1406 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", \
1407 IUPAC.protein)
1408 >>> my_mseq
1409 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
1410 >>> my_mseq.toseq()
1411 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
1412
1413 Note that the alphabet is preserved.
1414 """
1415 return Seq("".join(self.data), self.alphabet)
1416
1417
1418
1419
1420
1422 """Transcribes a DNA sequence into RNA.
1423
1424 If given a string, returns a new string object.
1425
1426 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
1427
1428 Trying to transcribe a protein or RNA sequence raises an exception.
1429
1430 e.g.
1431
1432 >>> transcribe("ACTGN")
1433 'ACUGN'
1434 """
1435 if isinstance(dna, Seq) :
1436 return dna.transcribe()
1437 elif isinstance(dna, MutableSeq):
1438 return dna.toseq().transcribe()
1439 else:
1440 return dna.replace('T','U').replace('t','u')
1441
1443 """Back-transcribes an RNA sequence into DNA.
1444
1445 If given a string, returns a new string object.
1446
1447 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
1448
1449 Trying to transcribe a protein or DNA sequence raises an exception.
1450
1451 e.g.
1452
1453 >>> back_transcribe("ACUGN")
1454 'ACTGN'
1455 """
1456 if isinstance(rna, Seq) :
1457 return rna.back_transcribe()
1458 elif isinstance(rna, MutableSeq):
1459 return rna.toseq().back_transcribe()
1460 else:
1461 return rna.replace('U','T').replace('u','t')
1462
1465 """Helper function to translate a nucleotide string (PRIVATE).
1466
1467 Arguments:
1468 - sequence - a string
1469 - table - a CodonTable object (NOT a table name or id number)
1470 - stop_symbol - a single character string, what to use for terminators.
1471 - to_stop - boolean, should translation terminate at the first
1472 in frame stop codon? If there is no in-frame stop codon
1473 then translation continues to the end.
1474 - pos_stop - a single character string for a possible stop codon
1475 (e.g. TAN or NNN)
1476
1477 Returns a string.
1478
1479 e.g.
1480
1481 >>> from Bio.Data import CodonTable
1482 >>> table = CodonTable.ambiguous_dna_by_id[1]
1483 >>> _translate_str("AAA", table)
1484 'K'
1485 >>> _translate_str("TAR", table)
1486 '*'
1487 >>> _translate_str("TAN", table)
1488 'X'
1489 >>> _translate_str("TAN", table, pos_stop="@")
1490 '@'
1491 >>> _translate_str("TA?", table)
1492 Traceback (most recent call last):
1493 ...
1494 TranslationError: Codon 'TA?' is invalid
1495 """
1496 sequence = sequence.upper()
1497 amino_acids = []
1498 forward_table = table.forward_table
1499 stop_codons = table.stop_codons
1500 if table.nucleotide_alphabet.letters is not None :
1501 valid_letters = set(table.nucleotide_alphabet.letters.upper())
1502 else :
1503
1504 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + \
1505 IUPAC.ambiguous_rna.letters.upper())
1506
1507 n = len(sequence)
1508 for i in xrange(0,n-n%3,3) :
1509 codon = sequence[i:i+3]
1510 try :
1511 amino_acids.append(forward_table[codon])
1512 except (KeyError, CodonTable.TranslationError) :
1513
1514 if codon in table.stop_codons :
1515 if to_stop : break
1516 amino_acids.append(stop_symbol)
1517 elif valid_letters.issuperset(set(codon)) :
1518
1519 amino_acids.append(pos_stop)
1520 else :
1521 raise CodonTable.TranslationError(\
1522 "Codon '%s' is invalid" % codon)
1523 return "".join(amino_acids)
1524
1525 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False):
1526 """Translate a nucleotide sequence into amino acids.
1527
1528 If given a string, returns a new string object. Given a Seq or
1529 MutableSeq, returns a Seq object with a protein alphabet.
1530
1531 Arguments:
1532 - table - Which codon table to use? This can be either a name
1533 (string) or an NCBI identifier (integer). Defaults
1534 to the "Standard" table.
1535 - stop_symbol - Single character string, what to use for any
1536 terminators, defaults to the asterisk, "*".
1537 - to_stop - Boolean, defaults to False meaning do a full
1538 translation continuing on past any stop codons
1539 (translated as the specified stop_symbol). If
1540 True, translation is terminated at the first in
1541 frame stop codon (and the stop_symbol is not
1542 appended to the returned protein sequence).
1543
1544 A simple string example using the default (standard) genetic code:
1545
1546 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
1547 >>> translate(coding_dna)
1548 'VAIVMGR*KGAR*'
1549 >>> translate(coding_dna, stop_symbol="@")
1550 'VAIVMGR@KGAR@'
1551 >>> translate(coding_dna, to_stop=True)
1552 'VAIVMGR'
1553
1554 Now using NCBI table 2, where TGA is not a stop codon:
1555
1556 >>> translate(coding_dna, table=2)
1557 'VAIVMGRWKGAR*'
1558 >>> translate(coding_dna, table=2, to_stop=True)
1559 'VAIVMGRWKGAR'
1560
1561 Note that if the sequence has no in-frame stop codon, then the to_stop
1562 argument has no effect:
1563
1564 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
1565 >>> translate(coding_dna2)
1566 'VAIVMGR'
1567 >>> translate(coding_dna2, to_stop=True)
1568 'VAIVMGR'
1569
1570 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
1571 or a stop codon. These are translated as "X". Any invalid codon
1572 (e.g. "TA?" or "T-A") will throw a TranslationError.
1573
1574 NOTE - Does NOT support gapped sequences.
1575
1576 It will however translate either DNA or RNA.
1577 """
1578 if isinstance(sequence, Seq) :
1579 return sequence.translate(table, stop_symbol, to_stop)
1580 elif isinstance(sequence, MutableSeq):
1581
1582 return sequence.toseq().translate(table, stop_symbol, to_stop)
1583 else:
1584
1585 try :
1586 codon_table = CodonTable.ambiguous_generic_by_id[int(table)]
1587 except ValueError :
1588 codon_table = CodonTable.ambiguous_generic_by_name[table]
1589 return _translate_str(sequence, codon_table, stop_symbol, to_stop)
1590
1624
1626 """Run the Bio.Seq module's doctests."""
1627 print "Runing doctests..."
1628 import doctest
1629 doctest.testmod()
1630 print "Done"
1631
1632 if __name__ == "__main__":
1633 _test()
1634