Package Bio :: Package Data :: Module CodonTable
[hide private]
[frames | no frames]

Source Code for Module Bio.Data.CodonTable

  1  #TODO - Remove this work around once we drop python 2.3 support 
  2  try: 
  3     set = set 
  4  except NameError: 
  5     from sets import Set as set 
  6   
from Bio import Alphabet
from Bio.Alphabet import IUPAC
from Bio.Data import IUPACData
 10   
 11  unambiguous_dna_by_name = {} 
 12  unambiguous_dna_by_id = {} 
 13  unambiguous_rna_by_name = {} 
 14  unambiguous_rna_by_id = {} 
 15  generic_by_name = {} # unambiguous DNA or RNA 
 16  generic_by_id = {} # unambiguous DNA or RNA 
 17  ambiguous_generic_by_name = {} # ambiguous DNA or RNA 
 18  ambiguous_generic_by_id = {} # ambiguous DNA or RNA  
 19   
 20  # standard IUPAC unambiguous codons 
 21  standard_dna_table = None 
 22  standard_rna_table = None 
 23   
 24  # In the future, the back_table could return a statistically 
 25  # appropriate distribution of codons, so do not cache the results of 
 26  # back_table lookups! 
 27   
class TranslationError(Exception):
    """Raised when a codon cannot be given a single translation.

    In this module it is raised for an ambiguous codon that codes for both
    amino acids and stop codons, and for an ambiguous codon whose possible
    amino acids share no ambiguous protein symbol.
    """

class CodonTable:
    """Base class for codon tables.

    Holds a forward table (codon -> amino acid), a back table (for back
    translations), the start and stop codon lists and the nucleotide and
    protein alphabets.  The class-level attributes below double as the
    defaults of __init__, so they are shared by every instance that does
    not supply its own.
    """

    nucleotide_alphabet = Alphabet.generic_nucleotide
    protein_alphabet = Alphabet.generic_protein

    forward_table = {}    # only includes codons which actually code
    back_table = {}       # for back translations
    start_codons = []
    stop_codons = []

    # Not always called from derived classes!
    def __init__(self, nucleotide_alphabet = nucleotide_alphabet,
                 protein_alphabet = protein_alphabet,
                 forward_table = forward_table, back_table = back_table,
                 start_codons = start_codons, stop_codons = stop_codons):
        """Store the alphabets, tables and start/stop codon lists."""
        # NOTE(review): the body of this method is not shown in the source
        # listing (its lines are missing).  These assignments are
        # reconstructed from the signature and from the way
        # AmbiguousCodonTable calls this method and then reads
        # forward_table / stop_codons -- confirm against the real file.
        self.nucleotide_alphabet = nucleotide_alphabet
        self.protein_alphabet = protein_alphabet
        self.forward_table = forward_table
        self.back_table = back_table
        self.start_codons = start_codons
        self.stop_codons = stop_codons

    def __str__(self):
        """Returns a simple text representation of the codon table

        e.g.
        >>> import Bio.Data.CodonTable
        >>> print Bio.Data.CodonTable.standard_dna_table
        >>> print Bio.Data.CodonTable.generic_by_id[1]"""

        # Only some subclasses set id and names, so do not assume they
        # exist; a table without them is reported as "Table ID unknown".
        table_id = getattr(self, "id", None)
        names = getattr(self, "names", None)

        if table_id:
            answer = "Table %i" % table_id
        else:
            answer = "Table ID unknown"
        if names:
            # names may contain None or empty entries; skip those
            answer += " " + ", ".join([n for n in names if n])

        #Use the main four letters (and the conventional ordering)
        #even for ambiguous tables
        letters = self.nucleotide_alphabet.letters
        if isinstance(self.nucleotide_alphabet, Alphabet.DNAAlphabet) \
        or (letters is not None and "T" in letters):
            letters = "TCAG"
        else:
            #Should be either RNA or generic nucleotides,
            #e.g. Bio.Data.CodonTable.generic_by_id[1]
            letters = "UCAG"

        #Build the table...
        # Every cell is 9 characters wide so that the header, the codon
        # rows and the "---------" rules line up.  The padding in these
        # format strings was collapsed in the source listing and has been
        # restored to that width.
        rule = "\n--+" \
               + "+".join(["---------" for c2 in letters]) + "+--"
        answer = answer + "\n\n  |" + "|".join( \
            ["  %s      " % c2 for c2 in letters] \
            ) + "|"
        answer = answer + rule
        for c1 in letters:
            for c3 in letters:
                line = c1 + " |"
                for c2 in letters:
                    codon = c1 + c2 + c3
                    line = line + " %s" % codon
                    if codon in self.stop_codons:
                        line = line + " Stop|"
                    else:
                        try:
                            amino = self.forward_table[codon]
                        except KeyError:
                            amino = "?"
                        except TranslationError:
                            amino = "?"
                        if codon in self.start_codons:
                            line = line + " %s(s)|" % amino
                        else:
                            line = line + " %s   |" % amino
                line = line + " " + c3
                answer = answer + "\n" + line
            # one rule after each block of four rows
            answer = answer + rule
        return answer

def make_back_table(table, default_stop_codon):
    """Build a back-translation table from a forward table.

    table maps codon -> amino acid.  The result maps each amino acid to
    a single codon, plus None -> default_stop_codon.  When several codons
    code for the same amino acid, the one that sorts last wins.
    """
    # ONLY RETURNS A SINGLE CODON
    # Do the sort so changes in the hash implementation won't affect
    # the result when one amino acid is coded by more than one codon.
    back_table = {}
    # Copy the keys into a list first: keys() is not sortable in place
    # where it returns a view rather than a list.
    codons = list(table.keys())
    codons.sort()
    for codon in codons:
        back_table[table[codon]] = codon
    back_table[None] = default_stop_codon
    return back_table


class NCBICodonTable(CodonTable):
    """Codon table identified by an id and a list of names.

    Does not call CodonTable.__init__; every instance attribute is set
    here, and the back table is derived from the forward table.
    """

    nucleotide_alphabet = Alphabet.generic_nucleotide
    protein_alphabet = IUPAC.protein

    def __init__(self, id, names, table, start_codons, stop_codons):
        """Store the table data and build the back table.

        The first entry of stop_codons is used as the codon that None
        maps to in the back table, so stop_codons must not be empty.
        """
        self.id = id
        self.names = names
        self.forward_table = table
        self.back_table = make_back_table(table, stop_codons[0])
        self.start_codons = start_codons
        self.stop_codons = stop_codons


class NCBICodonTableDNA(NCBICodonTable):
    """NCBICodonTable whose nucleotide alphabet is unambiguous DNA."""

    nucleotide_alphabet = IUPAC.unambiguous_dna

class NCBICodonTableRNA(NCBICodonTable):
    """NCBICodonTable whose nucleotide alphabet is unambiguous RNA."""

    nucleotide_alphabet = IUPAC.unambiguous_rna



def _rna_and_generic_codons(dna_codons):
    """Return (rna_codons, generic_codons) for a list of DNA codons.

    rna_codons has every T replaced by U.  generic_codons holds each DNA
    codon followed by its RNA spelling, but only when the two differ, so
    that codons without a T are not listed twice.
    """
    rna_codons = []
    generic_codons = []
    for codon in dna_codons:
        generic_codons.append(codon)
        rna_codon = codon.replace("T", "U")
        if rna_codon != codon:
            generic_codons.append(rna_codon)
        rna_codons.append(rna_codon)
    return rna_codons, generic_codons


def register_ncbi_table(name, alt_name, id,
                        table, start_codons, stop_codons):
    """Create the DNA, RNA and generic tables for one genetic code.

    name may hold several names separated by "; "; alt_name is a further
    name or None.  table maps DNA codons to amino acids.  The three
    tables built here are stored in the module-level by-id and by-name
    registries, and the table with id 1 also becomes standard_dna_table /
    standard_rna_table.
    """
    global standard_dna_table, standard_rna_table

    names = name.split("; ")
    # The names given to the table objects include alt_name even when it
    # is None.
    table_names = names + [alt_name]

    dna = NCBICodonTableDNA(id, table_names, table, start_codons,
                            stop_codons)

    # replace all T's with U's for the RNA tables
    rna_table = {}
    generic_table = {}
    for codon, val in table.items():
        generic_table[codon] = val
        codon = codon.replace("T", "U")
        generic_table[codon] = val
        rna_table[codon] = val
    rna_start_codons, generic_start_codons = \
        _rna_and_generic_codons(start_codons)
    rna_stop_codons, generic_stop_codons = \
        _rna_and_generic_codons(stop_codons)

    generic = NCBICodonTable(id, names + [alt_name], generic_table,
                             generic_start_codons, generic_stop_codons)
    rna = NCBICodonTableRNA(id, names + [alt_name], rna_table,
                            rna_start_codons, rna_stop_codons)

    if id == 1:
        standard_dna_table = dna
        standard_rna_table = rna

    unambiguous_dna_by_id[id] = dna
    unambiguous_rna_by_id[id] = rna
    generic_by_id[id] = generic

    # The by-name registries, unlike the table objects, never get a None
    # key.
    if alt_name is not None:
        names.append(alt_name)

    for table_name in names:
        unambiguous_dna_by_name[table_name] = dna
        unambiguous_rna_by_name[table_name] = rna
        generic_by_name[table_name] = generic

### These tables created from the data file
###    ftp://ncbi.nlm.nih.gov/entrez/misc/data/gc.prt
### using the following:
### NOTE(review): runs of spaces inside the string literals of this script
### were collapsed in the listing it was recovered from.  Leading spaces
### have been restored where the slice length fixes them (e.g. line[:6]);
### the remaining literals should be checked against gc.prt before reuse.
##import re
##for line in open("gc.prt").readlines():
##    if line[:2] == " {":
##        names = []
##        id = None
##        aa = None
##        start = None
##        bases = []
##    elif line[:6] == "  name":
##        names.append(re.search('"([^"]*)"', line).group(1))
##    elif line[:8] == "    name":
##        names.append(re.search('"(.*)$', line).group(1))
##    elif line == ' Mitochondrial; Mycoplasma; Spiroplasma" ,\n':
##        names[-1] = names[-1] + " Mitochondrial; Mycoplasma; Spiroplasma"
##    elif line[:4] == "  id":
##        id = int(re.search('(\d+)', line).group(1))
##    elif line[:10] == "  ncbieaa ":
##        aa = line[12:12+64]
##    elif line[:10] == "  sncbieaa":
##        start = line[12:12+64]
##    elif line[:9] == "  -- Base":
##        bases.append(line[12:12+64])
##    elif line[:2] == " }":
##        assert names != [] and id is not None and aa is not None
##        assert start is not None and bases != []
##        if len(names) == 1:
##            names.append(None)
##        print "register_ncbi_table(name = %s," % repr(names[0])
##        print " alt_name = %s, id = %d" % \
##              (repr(names[1]), id)
##        print " table = {"
##        s = " "
##        for i in range(64):
##            if aa[i] != "*":
##                t = " '%s%s%s': '%s'," % (bases[0][i], bases[1][i],
##                                          bases[2][i], aa[i])
##                if len(s) + len(t) > 75:
##                    print s
##                    s = " " + t
##                else:
##                    s = s + t
##        print s, "},"
##
##        s = " stop_codons = ["
##        for i in range(64):
##            if aa[i] == "*":
##                t = " '%s%s%s'," % (bases[0][i], bases[1][i], bases[2][i])
##                if len(s) + len(t) > 75:
##                    print s
##                    s = " " + t
##                else:
##                    s = s + t
##        print s, "],"
##
##        s = " start_codons = ["
##        for i in range(64):
##            if start[i] == "M":
##                t = " '%s%s%s'," % (bases[0][i], bases[1][i], bases[2][i])
##                if len(s) + len(t) > 75:
##                    print s
##                    s = " " + t
##                else:
##                    s = s + t
##        print s, "]"
##        print " )"
##    elif line[:2] == "--" or line == "\n" or line == "}\n" or \
##         line == 'Genetic-code-table ::= {\n':
##        pass
##    else:
##        raise Exception("Unparsed: " + repr(line))

register_ncbi_table(name = 'Standard',
                    alt_name = 'SGC0', id = 1,
                    table = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
    'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
    'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L',
    'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P',
    'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I',
    'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T',
    'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K',
    'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A',
    'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D',
    'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G',
    'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', 'TGA', ],
                    start_codons = [ 'TTG', 'CTG', 'ATG', ]
                    )
register_ncbi_table(name = 'Vertebrate Mitochondrial',
                    alt_name = 'SGC1', id = 2,
                    table = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
    'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
    'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
    'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
    'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
    'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
    'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
    'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'GTT': 'V',
    'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', 'GCC': 'A',
    'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D', 'GAA': 'E',
    'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', 'AGA', 'AGG', ],
                    start_codons = [ 'ATT', 'ATC', 'ATA', 'ATG', 'GTG', ]
                    )
register_ncbi_table(name = 'Yeast Mitochondrial',
                    alt_name = 'SGC2', id = 3,
                    table = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
    'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
    'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'T',
    'CTC': 'T', 'CTA': 'T', 'CTG': 'T', 'CCT': 'P', 'CCC': 'P',
    'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
    'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
    'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
    'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
    'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
    'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
    'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', ],
                    start_codons = [ 'ATG', ]
                    )
register_ncbi_table(name = 'Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma',
                    alt_name = 'SGC3', id = 4,
                    table = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
    'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
    'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
    'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
    'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
    'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
    'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
    'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
    'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
    'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
    'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', ],
                    start_codons = [ 'TTA', 'TTG', 'CTG', 'ATT', 'ATC',
                                     'ATA', 'ATG', 'GTG', ]
                    )
register_ncbi_table(name = 'Invertebrate Mitochondrial',
                    alt_name = 'SGC4', id = 5,
                    table = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
    'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
    'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
    'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
    'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
    'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
    'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
    'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'S',
    'AGG': 'S', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
    'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
    'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', ],
                    start_codons = [ 'TTG', 'ATT', 'ATC', 'ATA', 'ATG',
                                     'GTG', ]
                    )
register_ncbi_table(name = 'Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear',
                    alt_name = 'SGC5', id = 6,
                    table = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
    'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
    'TAA': 'Q', 'TAG': 'Q', 'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
    'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P',
    'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H',
    'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R',
    'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N',
    'AAC': 'N', 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S',
    'AGA': 'R', 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V',
    'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G',
    'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TGA', ],
                    start_codons = [ 'ATG', ]
                    )
register_ncbi_table(name = 'Echinoderm Mitochondrial',
                    alt_name = 'SGC8', id = 9,
                    table = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
    'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
    'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
    'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
    'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
    'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
    'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
    'AAA': 'N', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'S',
    'AGG': 'S', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
    'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
    'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', ],
                    start_codons = [ 'ATG', ]
                    )
register_ncbi_table(name = 'Euplotid Nuclear',
                    alt_name = 'SGC9', id = 10,
                    table = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
    'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
    'TGT': 'C', 'TGC': 'C', 'TGA': 'C', 'TGG': 'W', 'CTT': 'L',
    'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
    'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
    'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
    'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
    'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
    'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
    'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
    'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', ],
                    start_codons = [ 'ATG', ]
                    )
register_ncbi_table(name = 'Bacterial',
                    alt_name = None, id = 11,
                    table = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
    'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
    'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L',
    'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P',
    'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I',
    'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T',
    'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K',
    'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A',
    'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D',
    'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G',
    'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', 'TGA', ],
                    start_codons = [ 'TTG', 'CTG', 'ATT', 'ATC', 'ATA',
                                     'ATG', 'GTG', ]
                    )
register_ncbi_table(name = 'Alternative Yeast Nuclear',
                    alt_name = None, id = 12,
                    table = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
    'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
    'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L',
    'CTA': 'L', 'CTG': 'S', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P',
    'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I',
    'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T',
    'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K',
    'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A',
    'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D',
    'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G',
    'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', 'TGA', ],
                    start_codons = [ 'CTG', 'ATG', ]
                    )
register_ncbi_table(name = 'Ascidian Mitochondrial',
                    alt_name = None, id = 13,
                    table = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
    'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
    'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', 'CTT': 'L',
    'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
    'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
    'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'ATT': 'I', 'ATC': 'I', 'ATA': 'M', 'ATG': 'M', 'ACT': 'T',
    'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
    'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'G',
    'AGG': 'G', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
    'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
    'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TAG', ],
                    start_codons = [ 'ATG', ]
                    )
register_ncbi_table(name = 'Flatworm Mitochondrial',
                    alt_name = None, id = 14,
                    table = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
    'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
    'TAA': 'Y', 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
    'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P',
    'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H',
    'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R',
    'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N',
    'AAC': 'N', 'AAA': 'N', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S',
    'AGA': 'S', 'AGG': 'S', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V',
    'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G',
    'GGC': 'G', 'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAG', ],
                    start_codons = [ 'ATG', ]
                    )
register_ncbi_table(name = 'Blepharisma Macronuclear',
                    alt_name = None, id = 15,
                    table = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
    'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
    'TAG': 'Q', 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L',
    'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P',
    'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q',
    'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T',
    'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N',
    'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R',
    'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D',
    'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G',
    'GGA': 'G', 'GGG': 'G', },
                    stop_codons = [ 'TAA', 'TGA', ],
                    start_codons = [ 'ATG', ]
                    )

######### Deal with ambiguous forward translations

class AmbiguousCodonTable(CodonTable):
    """Codon table that also translates ambiguous codons.

    Wraps an existing codon table: the forward table becomes an
    AmbiguousForwardTable built on the wrapped table's forward table, and
    the start and stop codon lists are extended with
    list_ambiguous_codons().  Attributes not found on this object are
    looked up on the wrapped table.
    """

    def __init__(self, codon_table,
                 ambiguous_nucleotide_alphabet,
                 ambiguous_nucleotide_values,
                 ambiguous_protein_alphabet,
                 ambiguous_protein_values):
        """Wrap codon_table using the given ambiguity definitions."""
        forward_table = AmbiguousForwardTable(codon_table.forward_table,
                                              ambiguous_nucleotide_values,
                                              ambiguous_protein_values)

        # Original comment on the next two values:
        #   These two are WRONG!  I need to get the
        #   list of ambiguous codons which code for
        #   the stop codons  XXX
        # NOTE(review): both lists are passed through
        # list_ambiguous_codons() here, so that comment may be out of
        # date -- confirm before removing it.
        start_codons = list_ambiguous_codons(codon_table.start_codons,
                                             ambiguous_nucleotide_values)
        stop_codons = list_ambiguous_codons(codon_table.stop_codons,
                                            ambiguous_nucleotide_values)

        CodonTable.__init__(self,
                            ambiguous_nucleotide_alphabet,
                            ambiguous_protein_alphabet,
                            forward_table,
                            codon_table.back_table,
                            start_codons,
                            stop_codons)
        self._codon_table = codon_table

    # Be sneaky and forward attribute lookups to the original table.
    # This lets us get the names, if the original table is an NCBI
    # table.
    def __getattr__(self, name):
        """Look up a missing attribute on the wrapped codon table."""
        # If _codon_table itself is missing (it is only set at the end of
        # __init__), reading self._codon_table below would call this
        # method again without end; fail with a normal AttributeError.
        if name == "_codon_table":
            raise AttributeError(name)
        return getattr(self._codon_table, name)

def list_possible_proteins(codon, forward_table, ambiguous_nucleotide_values):
    """List the amino acids an (ambiguous) codon could code for.

    codon is three letters; ambiguous_nucleotide_values maps each letter
    to the unambiguous letters it stands for.  Every expansion of the
    codon is looked up in forward_table.

    Returns a list of the distinct amino acids found.

    Raises KeyError(codon) if every expansion is missing from
    forward_table (a true stop codon), and TranslationError if some
    expansions code and others do not.  A letter missing from
    ambiguous_nucleotide_values also raises KeyError.
    """
    c1, c2, c3 = codon
    x1 = ambiguous_nucleotide_values[c1]
    x2 = ambiguous_nucleotide_values[c2]
    x3 = ambiguous_nucleotide_values[c3]
    possible = {}
    stops = []
    for y1 in x1:
        for y2 in x2:
            for y3 in x3:
                expanded = y1 + y2 + y3
                try:
                    possible[forward_table[expanded]] = 1
                except KeyError:
                    # If tripping over a stop codon
                    stops.append(expanded)
    if stops:
        if possible:
            raise TranslationError("ambiguous codon '%s' codes " % codon \
                                   + "for both proteins and stop codons")
        # This is a true stop codon - tell the caller about it
        raise KeyError(codon)
    # Return a real list: callers index the result and take its length.
    return list(possible.keys())

def list_ambiguous_codons(codons, ambiguous_nucleotide_values):
    """Extends a codon list to include all possible ambiguous codons.

    e.g. ['TAG', 'TAA'] -> ['TAG', 'TAA', 'TAR']
         ['UAG', 'UAA'] -> ['UAG', 'UAA', 'UAR']

    Note that ['TAG', 'TGA'] -> ['TAG', 'TGA'], this does not add 'TRR'.
    Thus only two more codons are added in the following:

    e.g. ['TGA', 'TAA', 'TAG'] -> ['TGA', 'TAA', 'TAG', 'TAR', 'TRA']

    codons is a list of codon strings; ambiguous_nucleotide_values maps
    each letter to the unambiguous letters it stands for.

    Returns a new (longer) list of codon strings: the given codons in
    their original order, followed by the added ambiguous codons in
    sorted order.
    """

    #Note ambiguous_nucleotide_values['R'] = 'AG' (etc)
    #This will generate things like 'TRR' from ['TAG', 'TGA'], which
    #we don't want to include:

    # For each of the three positions, collect every letter all of whose
    # meanings occur at that position in the given codons.
    per_position = []
    for i in range(3):
        seen = set([codon[i] for codon in codons])
        per_position.append([letter for (letter, meanings) \
                             in ambiguous_nucleotide_values.items() \
                             if seen.issuperset(set(meanings))])
    c1_list, c2_list, c3_list = per_position

    candidates = set([c1+c2+c3 for c1 in c1_list for c2 in c2_list for c3 in c3_list])
    candidates.difference_update(codons)
    # Sort so the result does not depend on set iteration order.
    candidates = list(candidates)
    candidates.sort()

    answer = codons[:] #copy
    for ambig_codon in candidates:
        wanted = True
        #e.g. 'TRR' -> 'TAA', 'TAG', 'TGA', 'TGG'
        for codon in [c1+c2+c3 \
                      for c1 in ambiguous_nucleotide_values[ambig_codon[0]] \
                      for c2 in ambiguous_nucleotide_values[ambig_codon[1]] \
                      for c3 in ambiguous_nucleotide_values[ambig_codon[2]]]:
            if codon not in codons:
                #This ambiguous codon can stand for a codon which is not
                #in the given list, exclude it!
                wanted = False
                break
        if wanted:
            answer.append(ambig_codon)
    return answer
# Import-time checks of list_ambiguous_codons against the IUPAC tables.
assert list_ambiguous_codons(['TGA', 'TAA'],IUPACData.ambiguous_dna_values) == ['TGA', 'TAA', 'TRA']
assert list_ambiguous_codons(['TAG', 'TGA'],IUPACData.ambiguous_dna_values) == ['TAG', 'TGA']
assert list_ambiguous_codons(['TAG', 'TAA'],IUPACData.ambiguous_dna_values) == ['TAG', 'TAA', 'TAR']
assert list_ambiguous_codons(['UAG', 'UAA'],IUPACData.ambiguous_rna_values) == ['UAG', 'UAA', 'UAR']
assert list_ambiguous_codons(['TGA', 'TAA', 'TAG'],IUPACData.ambiguous_dna_values) == ['TGA', 'TAA', 'TAG', 'TAR', 'TRA']

# Forward translation is "onto", that is, any given codon always maps
# to the same protein, or it doesn't map at all.  Thus, I can build
# off of an existing table to produce the ambiguous mappings.
#
# This handles the general case.  Perhaps it's overkill?
#  >>> t = CodonTable.ambiguous_dna_by_id[1]
#  >>> t.forward_table["AAT"]
#  'N'
#  >>> t.forward_table["GAT"]
#  'D'
#  >>> t.forward_table["RAT"]
#  'B'
#  >>> t.forward_table["YTA"]
#  'L'

class AmbiguousForwardTable:
    """Dictionary-like forward table that accepts ambiguous codons.

    Lookups are answered from the wrapped unambiguous forward table when
    possible.  Otherwise the codon is expanded with the nucleotide
    ambiguity values and, if its translations differ, an ambiguous
    protein symbol covering all of them is returned.  Results, including
    failures, are cached per codon.
    """

    def __init__(self, forward_table, ambiguous_nucleotide, ambiguous_protein):
        """Wrap forward_table.

        ambiguous_nucleotide maps each nucleotide letter to the letters
        it stands for; ambiguous_protein does the same for protein
        symbols.
        """
        self.forward_table = forward_table

        self.ambiguous_nucleotide = ambiguous_nucleotide
        self.ambiguous_protein = ambiguous_protein

        # Invert ambiguous_protein: for each amino acid letter, the list
        # of protein symbols whose meaning includes it.
        inverted = {}
        for name, val in ambiguous_protein.items():
            for c in val:
                inverted.setdefault(c, {})[name] = 1
        for c in list(inverted.keys()):
            inverted[c] = list(inverted[c].keys())
        self._inverted = inverted

        # codon -> translation, or the class KeyError / TranslationError
        # as a marker for "this lookup fails with that exception".
        self._cache = {}

    def get(self, codon, failobj = None):
        """Return the translation of codon, or failobj for a stop codon.

        Only KeyError is turned into failobj; TranslationError still
        propagates.
        """
        try:
            return self.__getitem__(codon)
        except KeyError:
            return failobj

    def __getitem__(self, codon):
        """Return the (possibly ambiguous) amino acid for codon.

        Raises KeyError if the codon is a stop codon and TranslationError
        if it has no single translation.
        """
        try:
            x = self._cache[codon]
        except KeyError:
            pass
        else:
            if x is TranslationError:
                raise TranslationError(codon)   # no unique translation
            if x is KeyError:
                raise KeyError(codon)  # it's a stop codon
            return x

        try:
            x = self.forward_table[codon]
        except KeyError:
            pass
        else:
            self._cache[codon] = x
            return x

        # XXX Need to make part of this into a method which returns
        # a list of all possible encodings for a codon!
        try:
            possible = list_possible_proteins(codon,
                                              self.forward_table,
                                              self.ambiguous_nucleotide)
        except KeyError:
            self._cache[codon] = KeyError
            raise KeyError(codon)  # stop codon
        except TranslationError:
            self._cache[codon] = TranslationError
            raise TranslationError(codon)  # does not code
        # Make sure we hold a list, since it is indexed below.
        possible = list(possible)
        assert len(possible) > 0, "unambiguous codons must code"

        # Hah!  Only one possible protein, so use it
        if len(possible) == 1:
            self._cache[codon] = possible[0]
            return possible[0]

        # See if there's an ambiguous protein encoding for the multiples.
        # Find residues which exist in every coding set.
        ambiguous_possible = {}
        for amino in possible:
            # An amino acid that no protein symbol covers contributes
            # nothing, so the codon ends up as a TranslationError below
            # instead of a KeyError (which would mean "stop codon").
            for term in self._inverted.get(amino, []):
                ambiguous_possible[term] = ambiguous_possible.get(term, 0) + 1

        n = len(possible)
        candidates = []
        for term, count in ambiguous_possible.items():
            if count == n:
                candidates.append((len(self.ambiguous_protein[term]), term))

        # No amino acid encoding for the results
        if len(candidates) == 0:
            self._cache[codon] = TranslationError
            raise TranslationError(codon)   # no valid translation

        # All of these are valid, so choose one
        # To be unique, sort by smallest ambiguity then alphabetically
        # Can get this if "X" encodes for everything.
        candidates.sort()

        x = candidates[0][1]
        self._cache[codon] = x
        return x

#Prepare the ambiguous tables for DNA, RNA and Generic (DNA or RNA)
ambiguous_dna_by_name = {}
for key, val in unambiguous_dna_by_name.items():
    ambiguous_dna_by_name[key] = AmbiguousCodonTable(val,
                                     IUPAC.ambiguous_dna,
                                     IUPACData.ambiguous_dna_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)
ambiguous_dna_by_id = {}
for key, val in unambiguous_dna_by_id.items():
    ambiguous_dna_by_id[key] = AmbiguousCodonTable(val,
                                     IUPAC.ambiguous_dna,
                                     IUPACData.ambiguous_dna_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)

ambiguous_rna_by_name = {}
for key, val in unambiguous_rna_by_name.items():
    ambiguous_rna_by_name[key] = AmbiguousCodonTable(val,
                                     IUPAC.ambiguous_rna,
                                     IUPACData.ambiguous_rna_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)
ambiguous_rna_by_id = {}
for key, val in unambiguous_rna_by_id.items():
    ambiguous_rna_by_id[key] = AmbiguousCodonTable(val,
                                     IUPAC.ambiguous_rna,
                                     IUPACData.ambiguous_rna_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)

#The following isn't very elegant, but seems to work nicely.
# Copy of the RNA ambiguity values with an extra entry mapping "T" to
# "U", used for the generic (DNA or RNA) tables.
_merged_values = dict(IUPACData.ambiguous_rna_values.items())
_merged_values["T"] = "U"

for key, val in generic_by_name.items():
    ambiguous_generic_by_name[key] = AmbiguousCodonTable(val,
                                     Alphabet.NucleotideAlphabet(),
                                     _merged_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)

for key, val in generic_by_id.items():
    ambiguous_generic_by_id[key] = AmbiguousCodonTable(val,
                                     Alphabet.NucleotideAlphabet(),
                                     _merged_values,
                                     IUPAC.extended_protein,
                                     IUPACData.extended_protein_values)
del _merged_values
del key, val

#Basic sanity test,
for n in ambiguous_generic_by_id.keys():
    assert ambiguous_rna_by_id[n].forward_table["GUU"] == "V"
    assert ambiguous_rna_by_id[n].forward_table["GUN"] == "V"
    assert ambiguous_rna_by_id[n].forward_table["UUN"] == "X" #F or L
    #R = A or G, so URR = UAA or UGA / TRA = TAA or TGA = stop codons
    if "UAA" in unambiguous_rna_by_id[n].stop_codons \
    and "UGA" in unambiguous_rna_by_id[n].stop_codons:
        # The lookup must raise KeyError (stop codon).  Nothing is
        # printed, so importing the module stays silent.
        try:
            ambiguous_dna_by_id[n].forward_table["TRA"]
        except KeyError:
            pass
        else:
            assert False, "Should be a stop only"
        assert "URA" in ambiguous_generic_by_id[n].stop_codons
        assert "URA" in ambiguous_rna_by_id[n].stop_codons
        assert "TRA" in ambiguous_generic_by_id[n].stop_codons
        assert "TRA" in ambiguous_dna_by_id[n].stop_codons
del n
assert ambiguous_generic_by_id[1].stop_codons == ambiguous_generic_by_name["Standard"].stop_codons
assert ambiguous_generic_by_id[4].stop_codons == ambiguous_generic_by_name["SGC3"].stop_codons
assert ambiguous_generic_by_id[15].stop_codons == ambiguous_generic_by_name['Blepharisma Macronuclear'].stop_codons
