Package Bio :: Package Data :: Module CodonTable
[hide private]
[frames] | no frames]

Source Code for Module Bio.Data.CodonTable

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4   
  5  from Bio import Alphabet 
  6  from Bio.Alphabet import IUPAC 
  7  from Bio.Data import IUPACData 
  8   
  9  unambiguous_dna_by_name = {} 
 10  unambiguous_dna_by_id = {} 
 11  unambiguous_rna_by_name = {} 
 12  unambiguous_rna_by_id = {} 
 13  generic_by_name = {} # unambiguous DNA or RNA 
 14  generic_by_id = {} # unambiguous DNA or RNA 
 15  ambiguous_generic_by_name = {} # ambiguous DNA or RNA 
 16  ambiguous_generic_by_id = {} # ambiguous DNA or RNA  
 17   
 18  # standard IUPAC unambiguous codons 
 19  standard_dna_table = None 
 20  standard_rna_table = None 
 21   
 22  # In the future, the back_table could return a statistically 
 23  # appropriate distribution of codons, so do not cache the results of 
 24  # back_table lookups! 
 25   
class TranslationError(Exception):
    """Raised when a codon has no single, valid translation."""
28
class CodonTable:
    """A genetic code: which codons code for which amino acids.

    Attributes (the class-level values are only defaults):
     - nucleotide_alphabet  alphabet the codons are written in
     - protein_alphabet     alphabet of the translated letters
     - forward_table        codon -> amino acid; only includes codons
                            which actually code
     - back_table           used for back translations
     - start_codons         list of start codons
     - stop_codons          list of stop codons
    """

    nucleotide_alphabet = Alphabet.generic_nucleotide
    protein_alphabet = Alphabet.generic_protein

    forward_table = {}    # only includes codons which actually code
    back_table = {}       # for back translations
    start_codons = []
    stop_codons = []

    # Not always called from derived classes!
    def __init__(self, nucleotide_alphabet = nucleotide_alphabet,
                 protein_alphabet = protein_alphabet,
                 forward_table = forward_table, back_table = back_table,
                 start_codons = start_codons, stop_codons = stop_codons):
        """Store the alphabets, tables and codon lists on the instance.

        Every default is the shared class-level object of the same
        name, so a default table or list must be treated as read-only.
        """
        self.nucleotide_alphabet = nucleotide_alphabet
        self.protein_alphabet = protein_alphabet
        self.forward_table = forward_table
        self.back_table = back_table
        self.start_codons = start_codons
        self.stop_codons = stop_codons

    def __str__(self):
        """Return a simple text representation of the codon table.

        e.g.
        >>> import Bio.Data.CodonTable
        >>> print(Bio.Data.CodonTable.standard_dna_table)
        >>> print(Bio.Data.CodonTable.generic_by_id[1])
        """
        # This class defines neither 'id' nor 'names'; look them up with
        # a default so that a table without them can still be printed.
        table_id = getattr(self, "id", None)
        names = getattr(self, "names", None)
        if table_id:
            answer = "Table %i" % table_id
        else:
            answer = "Table ID unknown"
        if names:
            # The names list may contain None entries; skip them.
            answer += " " + ", ".join([name for name in names if name])

        # Use the main four letters (and the conventional ordering)
        # even for ambiguous tables
        letters = self.nucleotide_alphabet.letters
        if isinstance(self.nucleotide_alphabet, Alphabet.DNAAlphabet) \
           or (letters is not None and "T" in letters):
            letters = "TCAG"
        else:
            # Should be either RNA or generic nucleotides,
            # e.g. Bio.Data.CodonTable.generic_by_id[1]
            letters = "UCAG"

        # Build the table.  Every cell is nine characters wide so that it
        # lines up with the nine dashes of the horizontal rule.
        rule = "--+" + "+".join(["---------" for c2 in letters]) + "+--"
        lines = [answer,
                 "",
                 "  |" + "|".join(["  %s      " % c2 for c2 in letters]) + "|",
                 rule]
        for c1 in letters:
            for c3 in letters:
                line = c1 + " |"
                for c2 in letters:
                    codon = c1 + c2 + c3
                    line += " %s" % codon
                    if codon in self.stop_codons:
                        line += " Stop|"
                    else:
                        try:
                            amino = self.forward_table[codon]
                        except (KeyError, TranslationError):
                            amino = "?"
                        if codon in self.start_codons:
                            line += " %s(s)|" % amino
                        else:
                            line += " %s   |" % amino
                lines.append(line + " " + c3)
            lines.append(rule)
        return "\n".join(lines)
105
def make_back_table(table, default_stop_codon):
    """Return a back-translation dictionary for a forward codon table.

    table maps codons to amino acids.  The result maps every amino acid
    in table to ONE codon only, and maps None to default_stop_codon.

    The codons are visited in sorted order so that changes in the hash
    implementation won't affect the result when one amino acid is coded
    by more than one codon: the codon which sorts last is the one kept.
    """
    # ONLY RETURNS A SINGLE CODON
    back_table = {}
    # sorted() works on any mapping, whether or not keys() is a list.
    for codon in sorted(table):
        back_table[table[codon]] = codon
    back_table[None] = default_stop_codon
    return back_table
116 117
class NCBICodonTable(CodonTable):
    """Codon table carrying an id and a list of names.

    Uses the generic nucleotide alphabet and the IUPAC protein alphabet.
    """

    nucleotide_alphabet = Alphabet.generic_nucleotide
    protein_alphabet = IUPAC.protein

    def __init__(self, id, names, table, start_codons, stop_codons):
        """Store the table data and derive the back table from it.

        The first entry of stop_codons becomes the back table's stop
        codon, so stop_codons must not be empty.  CodonTable.__init__
        is not called; every attribute is assigned here.
        """
        default_stop_codon = stop_codons[0]
        self.id, self.names = id, names
        self.forward_table = table
        self.start_codons, self.stop_codons = start_codons, stop_codons
        self.back_table = make_back_table(table, default_stop_codon)
129 130
class NCBICodonTableDNA(NCBICodonTable):
    """NCBI codon table over the unambiguous IUPAC DNA alphabet."""

    nucleotide_alphabet = IUPAC.unambiguous_dna
133
class NCBICodonTableRNA(NCBICodonTable):
    """NCBI codon table over the unambiguous IUPAC RNA alphabet."""

    nucleotide_alphabet = IUPAC.unambiguous_rna
136 137 138
def register_ncbi_table(name, alt_name, id,
                        table, start_codons, stop_codons):
    """Build the DNA, RNA and generic forms of a table and register them.

    name is a "; " separated list of names, alt_name is one more name or
    None, and table, start_codons and stop_codons are written with DNA
    codons.  The three tables are stored in the module-level *_by_id
    dictionaries under id and in the *_by_name dictionaries under every
    name.  The table with id 1 also becomes standard_dna_table and
    standard_rna_table.
    """
    global standard_dna_table, standard_rna_table

    names = name.split("; ")

    dna = NCBICodonTableDNA(id, names + [alt_name], table, start_codons,
                            stop_codons)

    # replace all T's with U's for the RNA tables; the generic table
    # holds both spellings of every codon
    rna_table = {}
    generic_table = {}
    for dna_codon, amino in table.items():
        rna_codon = dna_codon.replace("T", "U")
        generic_table[dna_codon] = amino
        generic_table[rna_codon] = amino
        rna_table[rna_codon] = amino

    rna_start_codons = [codon.replace("T", "U") for codon in start_codons]
    rna_stop_codons = [codon.replace("T", "U") for codon in stop_codons]

    # The generic lists hold each DNA codon followed by its RNA spelling
    # (so a codon without a T appears twice in a row).
    generic_start_codons = []
    for pair in zip(start_codons, rna_start_codons):
        generic_start_codons.extend(pair)
    generic_stop_codons = []
    for pair in zip(stop_codons, rna_stop_codons):
        generic_stop_codons.extend(pair)

    generic = NCBICodonTable(id, names + [alt_name], generic_table,
                             generic_start_codons, generic_stop_codons)
    rna = NCBICodonTableRNA(id, names + [alt_name], rna_table,
                            rna_start_codons, rna_stop_codons)

    if id == 1:
        standard_dna_table = dna
        standard_rna_table = rna

    unambiguous_dna_by_id[id] = dna
    unambiguous_rna_by_id[id] = rna
    generic_by_id[id] = generic

    if alt_name is not None:
        names.append(alt_name)

    for alias in names:
        unambiguous_dna_by_name[alias] = dna
        unambiguous_rna_by_name[alias] = rna
        generic_by_name[alias] = generic
### These tables created from the data file
###  ftp://ncbi.nlm.nih.gov/entrez/misc/data/gc.prt
### using the following:
##import re
##for line in open("gc.prt").readlines():
##    if line[:2] == " {":
##        names = []
##        id = None
##        aa = None
##        start = None
##        bases = []
##    elif line[:6] == "  name":
##        names.append(re.search('"([^"]*)"', line).group(1))
##    elif line[:8] == "    name":
##        names.append(re.search('"(.*)$', line).group(1))
##    elif line == ' Mitochondrial; Mycoplasma; Spiroplasma" ,\n':
##        names[-1] = names[-1] + " Mitochondrial; Mycoplasma; Spiroplasma"
##    elif line[:4] == "  id":
##        id = int(re.search('(\d+)', line).group(1))
##    elif line[:10] == "  ncbieaa ":
##        aa = line[12:12+64]
##    elif line[:10] == "  sncbieaa":
##        start = line[12:12+64]
##    elif line[:9] == "  -- Base":
##        bases.append(line[12:12+64])
##    elif line[:2] == " }":
##        assert names != [] and id is not None and aa is not None
##        assert start is not None and bases != []
##        if len(names) == 1:
##            names.append(None)
##        print "register_ncbi_table(name = %s," % repr(names[0])
##        print " alt_name = %s, id = %d," % \
##              (repr(names[1]), id)
##        print " table = {"
##        s = " "
##        for i in range(64):
##            if aa[i] != "*":
##                t = " '%s%s%s': '%s'," % (bases[0][i], bases[1][i],
##                                          bases[2][i], aa[i])
##                if len(s) + len(t) > 75:
##                    print s
##                    s = " " + t
##                else:
##                    s = s + t
##        print s, "},"
##
##        s = " stop_codons = ["
##        for i in range(64):
##            if aa[i] == "*":
##                t = " '%s%s%s'," % (bases[0][i], bases[1][i], bases[2][i])
##                if len(s) + len(t) > 75:
##                    print s
##                    s = " " + t
##                else:
##                    s = s + t
##        print s, "],"
##
##        s = " start_codons = ["
##        for i in range(64):
##            if start[i] == "M":
##                t = " '%s%s%s'," % (bases[0][i], bases[1][i], bases[2][i])
##                if len(s) + len(t) > 75:
##                    print s
##                    s = " " + t
##                else:
##                    s = s + t
##        print s, "]"
##        print " )"
##    elif line[:2] == "--" or line == "\n" or line == "}\n" or \
##         line == 'Genetic-code-table ::= {\n':
##        pass
##    else:
##        raise Exception("Unparsed: " + repr(line))

# Each table below is laid out with one row per pair of leading bases;
# codons listed under stop_codons are absent from the table itself.

register_ncbi_table(
    name='Standard',
    alt_name='SGC0', id=1,
    table={
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'TAT': 'Y', 'TAC': 'Y',
        'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    },
    stop_codons=['TAA', 'TAG', 'TGA'],
    start_codons=['TTG', 'CTG', 'ATG']
)
register_ncbi_table(
    name='Vertebrate Mitochondrial',
    alt_name='SGC1', id=2,
    table={
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'TAT': 'Y', 'TAC': 'Y',
        'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGT': 'S', 'AGC': 'S',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    },
    stop_codons=['TAA', 'TAG', 'AGA', 'AGG'],
    start_codons=['ATT', 'ATC', 'ATA', 'ATG', 'GTG']
)
register_ncbi_table(
    name='Yeast Mitochondrial',
    alt_name='SGC2', id=3,
    table={
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'TAT': 'Y', 'TAC': 'Y',
        'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
        'CTT': 'T', 'CTC': 'T', 'CTA': 'T', 'CTG': 'T',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    },
    stop_codons=['TAA', 'TAG'],
    start_codons=['ATG']
)
register_ncbi_table(
    name='Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma',
    alt_name='SGC3', id=4,
    table={
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'TAT': 'Y', 'TAC': 'Y',
        'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    },
    stop_codons=['TAA', 'TAG'],
    start_codons=['TTA', 'TTG', 'CTG', 'ATT', 'ATC',
                  'ATA', 'ATG', 'GTG']
)
register_ncbi_table(
    name='Invertebrate Mitochondrial',
    alt_name='SGC4', id=5,
    table={
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'TAT': 'Y', 'TAC': 'Y',
        'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'S', 'AGG': 'S',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    },
    stop_codons=['TAA', 'TAG'],
    start_codons=['TTG', 'ATT', 'ATC', 'ATA', 'ATG',
                  'GTG']
)
register_ncbi_table(
    name='Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear',
    alt_name='SGC5', id=6,
    table={
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q', 'TAG': 'Q',
        'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    },
    stop_codons=['TGA'],
    start_codons=['ATG']
)
register_ncbi_table(
    name='Echinoderm Mitochondrial',
    alt_name='SGC8', id=9,
    table={
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'TAT': 'Y', 'TAC': 'Y',
        'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'N', 'AAG': 'K',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'S', 'AGG': 'S',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    },
    stop_codons=['TAA', 'TAG'],
    start_codons=['ATG']
)
register_ncbi_table(
    name='Euplotid Nuclear',
    alt_name='SGC9', id=10,
    table={
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'TAT': 'Y', 'TAC': 'Y',
        'TGT': 'C', 'TGC': 'C', 'TGA': 'C', 'TGG': 'W',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    },
    stop_codons=['TAA', 'TAG'],
    start_codons=['ATG']
)
register_ncbi_table(
    name='Bacterial',
    alt_name=None, id=11,
    table={
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'TAT': 'Y', 'TAC': 'Y',
        'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    },
    stop_codons=['TAA', 'TAG', 'TGA'],
    start_codons=['TTG', 'CTG', 'ATT', 'ATC', 'ATA',
                  'ATG', 'GTG']
)
register_ncbi_table(
    name='Alternative Yeast Nuclear',
    alt_name=None, id=12,
    table={
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'TAT': 'Y', 'TAC': 'Y',
        'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'S',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    },
    stop_codons=['TAA', 'TAG', 'TGA'],
    start_codons=['CTG', 'ATG']
)
register_ncbi_table(
    name='Ascidian Mitochondrial',
    alt_name=None, id=13,
    table={
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'TAT': 'Y', 'TAC': 'Y',
        'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'G', 'AGG': 'G',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    },
    stop_codons=['TAA', 'TAG'],
    start_codons=['ATG']
)
register_ncbi_table(
    name='Flatworm Mitochondrial',
    alt_name=None, id=14,
    table={
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Y',
        'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'N', 'AAG': 'K',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'S', 'AGG': 'S',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    },
    stop_codons=['TAG'],
    start_codons=['ATG']
)
register_ncbi_table(
    name='Blepharisma Macronuclear',
    alt_name=None, id=15,
    table={
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'TAT': 'Y', 'TAC': 'Y', 'TAG': 'Q',
        'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    },
    stop_codons=['TAA', 'TGA'],
    start_codons=['ATG']
)

######### Deal with ambiguous forward translations

class AmbiguousCodonTable(CodonTable):
    """Codon table which also accepts ambiguous nucleotide letters.

    Wraps an existing codon table.  Attributes which are not set on the
    wrapper itself are looked up on the wrapped table.
    """

    def __init__(self, codon_table,
                 ambiguous_nucleotide_alphabet,
                 ambiguous_nucleotide_values,
                 ambiguous_protein_alphabet,
                 ambiguous_protein_values):
        """Build the ambiguous view of codon_table.

        ambiguous_nucleotide_values and ambiguous_protein_values map each
        ambiguous letter to the letters it can stand for.
        """
        CodonTable.__init__(self,
                            ambiguous_nucleotide_alphabet,
                            ambiguous_protein_alphabet,
                            AmbiguousForwardTable(codon_table.forward_table,
                                                  ambiguous_nucleotide_values,
                                                  ambiguous_protein_values),
                            codon_table.back_table,

                            # The original start/stop codons, extended by
                            # list_ambiguous_codons with the ambiguous
                            # codons derived from them.
                            list_ambiguous_codons(codon_table.start_codons,
                                                  ambiguous_nucleotide_values),
                            list_ambiguous_codons(codon_table.stop_codons,
                                                  ambiguous_nucleotide_values)
                            )
        self._codon_table = codon_table

    # Be sneaky and forward attribute lookups to the original table.
    # This lets us get the names, if the original table is an NCBI
    # table.
    def __getattr__(self, name):
        """Look up a missing attribute on the wrapped codon table."""
        # __getattr__ only runs when normal lookup has failed.  If
        # _codon_table itself is missing (the instance was created
        # without running __init__, e.g. while being copied), forwarding
        # would call __getattr__ again for ever.
        if name == "_codon_table":
            raise AttributeError(name)
        return getattr(self._codon_table, name)
543
def list_possible_proteins(codon, forward_table, ambiguous_nucleotide_values):
    """Return a list of the amino acids an ambiguous codon can code for.

    codon is a three letter string, forward_table maps unambiguous
    codons to amino acids, and ambiguous_nucleotide_values maps every
    nucleotide letter to the letters it can stand for.

    Raises KeyError(codon) if every expansion of the codon is missing
    from forward_table (a true stop codon), and TranslationError if only
    some of them are.  A letter missing from ambiguous_nucleotide_values
    also raises KeyError.
    """
    c1, c2, c3 = codon
    x1 = ambiguous_nucleotide_values[c1]
    x2 = ambiguous_nucleotide_values[c2]
    x3 = ambiguous_nucleotide_values[c3]
    possible = {}  # used as a set of amino acids
    found_stop = False
    for y1 in x1:
        for y2 in x2:
            for y3 in x3:
                try:
                    possible[forward_table[y1 + y2 + y3]] = 1
                except KeyError:
                    # If tripping over a stop codon
                    found_stop = True
    if found_stop:
        if possible:
            raise TranslationError("ambiguous codon '%s' codes " % codon \
                                   + "for both proteins and stop codons")
        # This is a true stop codon - tell the caller about it
        raise KeyError(codon)
    # Always a real list: callers index it and take its length.
    return list(possible)
566
def list_ambiguous_codons(codons, ambiguous_nucleotide_values):
    """Extend a codon list to include all possible ambiguous codons.

    e.g. ['TAG', 'TAA'] -> ['TAG', 'TAA', 'TAR']
         ['UAG', 'UAA'] -> ['UAG', 'UAA', 'UAR']

    Note that ['TAG', 'TGA'] -> ['TAG', 'TGA'], this does not add 'TRR'.
    Thus only two more codons are added in the following:

    e.g. ['TGA', 'TAA', 'TAG'] -> ['TGA', 'TAA', 'TAG', 'TAR', 'TRA']

    An ambiguous codon is only added if every codon it can stand for is
    already in codons.  The added codons come after the original ones,
    in an order which follows the iteration order of
    ambiguous_nucleotide_values.

    Returns a new (longer) list of codon strings.
    """
    known = set(codons)

    def letters_covered_at(position):
        # Letters whose every meaning occurs at this position in at least
        # one of the given codons.
        seen = set([codon[position] for codon in codons])
        return [letter for (letter, meanings)
                in ambiguous_nucleotide_values.items()
                if seen.issuperset(meanings)]

    # Note ambiguous_nucleotide_values['R'] = 'AG' (etc)
    # This will generate things like 'TRR' from ['TAG', 'TGA'], which
    # we don't want to include:
    c1_list = letters_covered_at(0)
    c2_list = letters_covered_at(1)
    c3_list = letters_covered_at(2)

    # candidates is a list (not a set) to preserve the iteration order
    candidates = []
    for c1 in c1_list:
        for c2 in c2_list:
            for c3 in c3_list:
                codon = c1 + c2 + c3
                if codon not in candidates and codon not in known:
                    candidates.append(codon)

    answer = codons[:]  # copy
    for ambig_codon in candidates:
        # e.g. 'TRR' -> 'TAA', 'TAG', 'TGA', 'TGG'
        expansions = [c1 + c2 + c3
                      for c1 in ambiguous_nucleotide_values[ambig_codon[0]]
                      for c2 in ambiguous_nucleotide_values[ambig_codon[1]]
                      for c3 in ambiguous_nucleotide_values[ambig_codon[2]]]
        for codon in expansions:
            if codon not in known:
                # This ambiguous codon can also stand for a codon which
                # was not given, exclude it!
                break
        else:
            answer.append(ambig_codon)
    return answer
# Self-checks run when the module is imported.  Each entry is
# (codons, ambiguity mapping, expected extended list).
for _codons, _values, _expected in [
    (['TGA', 'TAA'], IUPACData.ambiguous_dna_values,
     ['TGA', 'TAA', 'TRA']),
    (['TAG', 'TGA'], IUPACData.ambiguous_dna_values,
     ['TAG', 'TGA']),
    (['TAG', 'TAA'], IUPACData.ambiguous_dna_values,
     ['TAG', 'TAA', 'TAR']),
    (['UAG', 'UAA'], IUPACData.ambiguous_rna_values,
     ['UAG', 'UAA', 'UAR']),
    (['TGA', 'TAA', 'TAG'], IUPACData.ambiguous_dna_values,
     ['TGA', 'TAA', 'TAG', 'TAR', 'TRA']),
]:
    assert list_ambiguous_codons(_codons, _values) == _expected
del _codons, _values, _expected

# Forward translation is "onto", that is, any given codon always maps
# to the same protein, or it doesn't map at all.  Thus, I can build
# off of an existing table to produce the ambiguous mappings.
#
# This handles the general case.  Perhaps it's overkill?
#  >>> t = CodonTable.ambiguous_dna_by_id[1]
#  >>> t.forward_table["AAT"]
#  'N'
#  >>> t.forward_table["GAT"]
#  'D'
#  >>> t.forward_table["RAT"]
#  'B'
#  >>> t.forward_table["YTA"]
#  'L'

class AmbiguousForwardTable:
    """Dictionary-like forward table which accepts ambiguous codons.

    Wraps an unambiguous forward table (codon -> amino acid).  Looking
    up a codon gives its amino acid (possibly an ambiguous amino acid
    letter), raises KeyError if the codon is a stop codon, or raises
    TranslationError if it has no unique translation.  Results, including
    the two failures, are cached per codon.
    """

    def __init__(self, forward_table, ambiguous_nucleotide, ambiguous_protein):
        """Remember the tables and index the ambiguous protein letters.

        ambiguous_nucleotide and ambiguous_protein map each letter to the
        letters it can stand for.
        """
        self.forward_table = forward_table

        self.ambiguous_nucleotide = ambiguous_nucleotide
        self.ambiguous_protein = ambiguous_protein

        # Invert ambiguous_protein: for each amino acid, the letters
        # which can stand for it.  Dictionaries are used as sets so that
        # each letter is listed once.
        inverted = {}
        for name, val in ambiguous_protein.items():
            for c in val:
                inverted.setdefault(c, {})[name] = 1
        self._inverted = dict([(c, list(names))
                               for c, names in inverted.items()])

        # codon -> amino acid, or the KeyError / TranslationError class
        # itself when looking the codon up must raise that exception
        self._cache = {}

    def get(self, codon, failobj = None):
        """Return the translation of codon, or failobj for a stop codon.

        Only KeyError is turned into failobj; TranslationError is still
        raised.
        """
        try:
            return self.__getitem__(codon)
        except KeyError:
            return failobj

    def __getitem__(self, codon):
        """Return the (possibly ambiguous) amino acid for codon."""
        try:
            x = self._cache[codon]
        except KeyError:
            pass
        else:
            if x is TranslationError:
                raise TranslationError(codon)   # no unique translation
            if x is KeyError:
                raise KeyError(codon)  # it's a stop codon
            return x
        try:
            x = self.forward_table[codon]
            self._cache[codon] = x
            return x
        except KeyError:
            pass

        # XXX Need to make part of this into a method which returns
        # a list of all possible encodings for a codon!
        try:
            # list() so the result can be indexed whatever kind of
            # sequence or view the helper returns
            possible = list(list_possible_proteins(codon,
                                                   self.forward_table,
                                                   self.ambiguous_nucleotide))
        except KeyError:
            self._cache[codon] = KeyError
            raise KeyError(codon)  # stop codon
        except TranslationError:
            self._cache[codon] = TranslationError
            raise TranslationError(codon)  # does not code
        assert len(possible) > 0, "unambiguous codons must code"

        # Hah!  Only one possible protein, so use it
        if len(possible) == 1:
            self._cache[codon] = possible[0]
            return possible[0]

        # See if there's an ambiguous protein encoding for the multiples.
        # Find residues which exist in every coding set.
        ambiguous_possible = {}
        for amino in possible:
            for term in self._inverted[amino]:
                ambiguous_possible[term] = ambiguous_possible.get(term, 0) + 1

        n = len(possible)
        possible = []
        for amino, val in ambiguous_possible.items():
            if val == n:
                possible.append(amino)

        # No amino acid encoding for the results
        if len(possible) == 0:
            self._cache[codon] = TranslationError
            raise TranslationError(codon)   # no valid translation

        # All of these are valid, so choose one
        # To be unique, sort by smallest ambiguity then alphabetically
        # Can get this if "X" encodes for everything.
        table = self.ambiguous_protein
        possible.sort(key=lambda amino: (len(table[amino]), amino))

        x = possible[0]
        self._cache[codon] = x
        return x

#Prepare the ambiguous tables for DNA, RNA and Generic (DNA or RNA)
def _ambiguous_tables(tables, make_alphabet, nucleotide_values):
    """Return a dict with an AmbiguousCodonTable for each entry of tables.

    make_alphabet is called once per table to get its nucleotide
    alphabet.  Every table gets the IUPAC extended protein alphabet.
    """
    wrapped = {}
    for key, table in tables.items():
        wrapped[key] = AmbiguousCodonTable(table,
                                           make_alphabet(),
                                           nucleotide_values,
                                           IUPAC.extended_protein,
                                           IUPACData.extended_protein_values)
    return wrapped

ambiguous_dna_by_name = _ambiguous_tables(unambiguous_dna_by_name,
                                          lambda: IUPAC.ambiguous_dna,
                                          IUPACData.ambiguous_dna_values)
ambiguous_dna_by_id = _ambiguous_tables(unambiguous_dna_by_id,
                                        lambda: IUPAC.ambiguous_dna,
                                        IUPACData.ambiguous_dna_values)

ambiguous_rna_by_name = _ambiguous_tables(unambiguous_rna_by_name,
                                          lambda: IUPAC.ambiguous_rna,
                                          IUPACData.ambiguous_rna_values)
ambiguous_rna_by_id = _ambiguous_tables(unambiguous_rna_by_id,
                                        lambda: IUPAC.ambiguous_rna,
                                        IUPACData.ambiguous_rna_values)

#The following isn't very elegant, but seems to work nicely.
#The generic tables accept both spellings, so T is added to the RNA
#ambiguity values as another way of writing U.
_merged_values = dict(IUPACData.ambiguous_rna_values)
_merged_values["T"] = "U"

#The generic registries already exist, so they are filled in place.
ambiguous_generic_by_name.update(
    _ambiguous_tables(generic_by_name,
                      Alphabet.NucleotideAlphabet,
                      _merged_values))
ambiguous_generic_by_id.update(
    _ambiguous_tables(generic_by_id,
                      Alphabet.NucleotideAlphabet,
                      _merged_values))
del _merged_values
del _ambiguous_tables

#Basic sanity test,
for n in list(ambiguous_generic_by_id):
    assert ambiguous_rna_by_id[n].forward_table["GUU"] == "V"
    assert ambiguous_rna_by_id[n].forward_table["GUN"] == "V"
    assert ambiguous_rna_by_id[n].forward_table["UUN"] == "X" #F or L
    #R = A or G, so URA = UAA or UGA / TRA = TAA or TGA = stop codons
    if "UAA" in unambiguous_rna_by_id[n].stop_codons \
    and "UGA" in unambiguous_rna_by_id[n].stop_codons:
        try:
            #The lookup must raise KeyError; if it does not, show what
            #came back before failing.
            print(ambiguous_dna_by_id[n].forward_table["TRA"])
            assert False, "Should be a stop only"
        except KeyError:
            pass
        assert "URA" in ambiguous_generic_by_id[n].stop_codons
        assert "URA" in ambiguous_rna_by_id[n].stop_codons
        assert "TRA" in ambiguous_generic_by_id[n].stop_codons
        assert "TRA" in ambiguous_dna_by_id[n].stop_codons
del n
assert ambiguous_generic_by_id[1].stop_codons == ambiguous_generic_by_name["Standard"].stop_codons
assert ambiguous_generic_by_id[4].stop_codons == ambiguous_generic_by_name["SGC3"].stop_codons
assert ambiguous_generic_by_id[15].stop_codons == ambiguous_generic_by_name['Blepharisma Macronuclear'].stop_codons
