Package Bio :: Package SeqUtils
[hide private]
[frames] | no frames]

Source Code for Package Bio.SeqUtils

  1  #!/usr/bin/env python 
  2  # Created: Wed May 29 08:07:18 2002 
  3  # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se 
  4  # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark. 
  5  # All rights reserved. 
  6  # This code is part of the Biopython distribution and governed by its 
  7  # license.  Please see the LICENSE file that should have been included 
  8  # as part of this package. 
  9   
 10  """Miscellaneous functions for dealing with sequences.""" 
 11   
 12  import re, time 
 13  from Bio import SeqIO 
 14  from Bio.Seq import Seq 
 15  from Bio import Alphabet 
 16  from Bio.Alphabet import IUPAC 
 17  from Bio.Data import IUPACData, CodonTable 
 18   
 19   
 20  ###################################### 
 21  # DNA 
 22  ###################### 
 23  # {{{  
 24   
25 -def reverse(seq):
26 """Reverse the sequence. Works on string sequences. 27 28 e.g. 29 >>> reverse("ACGGT") 30 'TGGCA' 31 32 """ 33 r = list(seq) 34 r.reverse() 35 return ''.join(r)
36
37 -def GC(seq):
38 """Calculates G+C content, returns the percentage (float between 0 and 100). 39 40 Copes mixed case seuqneces, and with the ambiguous nucleotide S (G or C) 41 when counting the G and C content. The percentage is calculated against 42 the full length, e.g.: 43 44 >>> from Bio.SeqUtils import GC 45 >>> GC("ACTGN") 46 40.0 47 48 Note that this will return zero for an empty sequence. 49 """ 50 try : 51 gc = sum(map(seq.count,['G','C','g','c','S','s'])) 52 return gc*100.0/len(seq) 53 except ZeroDivisionError : 54 return 0.0
55 56
57 -def GC123(seq):
58 """Calculates total G+C content plus first, second and third positions. 59 60 Returns a tuple of four floats (percentages between 0 and 100) for the 61 entire sequence, and the three codon positions. e.g. 62 63 >>> from Bio.SeqUtils import GC123 64 >>> GC123("ACTGTN") 65 (40.0, 50.0, 50.0, 0.0) 66 67 Copes with mixed case sequences, but does NOT deal with ambiguous 68 nucleotides. 69 """ 70 d= {} 71 for nt in ['A','T','G','C']: 72 d[nt] = [0,0,0] 73 74 for i in range(0,len(seq),3): 75 codon = seq[i:i+3] 76 if len(codon) <3: codon += ' ' 77 for pos in range(0,3): 78 for nt in ['A','T','G','C']: 79 if codon[pos] == nt or codon[pos] == nt.lower(): 80 d[nt][pos] += 1 81 gc = {} 82 gcall = 0 83 nall = 0 84 for i in range(0,3): 85 try: 86 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i] 87 gc[i] = (d['G'][i] + d['C'][i])*100.0/n 88 except: 89 gc[i] = 0 90 91 gcall = gcall + d['G'][i] + d['C'][i] 92 nall = nall + n 93 94 gcall = 100.0*gcall/nall 95 return gcall, gc[0], gc[1], gc[2]
96
97 -def GC_skew(seq, window = 100):
98 """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence. 99 100 Returns a list of ratios (floats), controlled by the length of the sequence 101 and the size of the window. 102 103 Does NOT look at any ambiguous nucleotides. 104 """ 105 # 8/19/03: Iddo: added lowercase 106 values = [] 107 for i in range(0, len(seq), window): 108 s = seq[i: i + window] 109 g = s.count('G') + s.count('g') 110 c = s.count('C') + s.count('c') 111 skew = (g-c)/float(g+c) 112 values.append(skew) 113 return values
114 115 from math import pi, sin, cos, log
116 -def xGC_skew(seq, window = 1000, zoom = 100, 117 r = 300, px = 100, py = 100):
118 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!).""" 119 from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \ 120 VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y 121 yscroll = Scrollbar(orient = VERTICAL) 122 xscroll = Scrollbar(orient = HORIZONTAL) 123 canvas = Canvas(yscrollcommand = yscroll.set, 124 xscrollcommand = xscroll.set, background = 'white') 125 win = canvas.winfo_toplevel() 126 win.geometry('700x700') 127 128 yscroll.config(command = canvas.yview) 129 xscroll.config(command = canvas.xview) 130 yscroll.pack(side = RIGHT, fill = Y) 131 xscroll.pack(side = BOTTOM, fill = X) 132 canvas.pack(fill=BOTH, side = LEFT, expand = 1) 133 canvas.update() 134 135 X0, Y0 = r + px, r + py 136 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r 137 138 ty = Y0 139 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq))) 140 ty +=20 141 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq))) 142 ty +=20 143 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue') 144 ty +=20 145 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta') 146 ty +=20 147 canvas.create_oval(x1,y1, x2, y2) 148 149 acc = 0 150 start = 0 151 for gc in GC_skew(seq, window): 152 r1 = r 153 acc+=gc 154 # GC skew 155 alpha = pi - (2*pi*start)/len(seq) 156 r2 = r1 - gc*zoom 157 x1 = X0 + r1 * sin(alpha) 158 y1 = Y0 + r1 * cos(alpha) 159 x2 = X0 + r2 * sin(alpha) 160 y2 = Y0 + r2 * cos(alpha) 161 canvas.create_line(x1,y1,x2,y2, fill = 'blue') 162 # accumulated GC skew 163 r1 = r - 50 164 r2 = r1 - acc 165 x1 = X0 + r1 * sin(alpha) 166 y1 = Y0 + r1 * cos(alpha) 167 x2 = X0 + r2 * sin(alpha) 168 y2 = Y0 + r2 * cos(alpha) 169 canvas.create_line(x1,y1,x2,y2, fill = 'magenta') 170 171 canvas.update() 172 start += window 173 174 canvas.configure(scrollregion = canvas.bbox(ALL))
175
176 -def molecular_weight(seq):
177 """Calculate the molecular weight of a DNA sequence.""" 178 if type(seq) == type(''): seq = Seq(seq, IUPAC.unambiguous_dna) 179 weight_table = IUPACData.unambiguous_dna_weights 180 return sum(weight_table[x] for x in seq)
181
182 -def nt_search(seq, subseq):
183 """Search for a DNA subseq in sequence. 184 185 use ambiguous values (like N = A or T or C or G, R = A or G etc.) 186 searches only on forward strand 187 """ 188 pattern = '' 189 for nt in subseq: 190 value = IUPACData.ambiguous_dna_values[nt] 191 if len(value) == 1: 192 pattern += value 193 else: 194 pattern += '[%s]' % value 195 196 pos = -1 197 result = [pattern] 198 l = len(seq) 199 while True: 200 pos+=1 201 s = seq[pos:] 202 m = re.search(pattern, s) 203 if not m: break 204 pos += int(m.start(0)) 205 result.append(pos) 206 return result
207 208 # }}} 209 210 ###################################### 211 # Protein 212 ###################### 213 # {{{ 214 215 # temporary hack for exception free translation of "dirty" DNA 216 # should be moved to ??? 217
218 -class ProteinX(Alphabet.ProteinAlphabet):
219 letters = IUPACData.extended_protein_letters + "X"
220 221 proteinX = ProteinX() 222
223 -class MissingTable:
224 - def __init__(self, table):
225 self._table = table
226 - def get(self, codon, stop_symbol):
227 try: 228 return self._table.get(codon, stop_symbol) 229 except CodonTable.TranslationError: 230 return 'X'
231
232 -def makeTableX(table):
233 assert table.protein_alphabet == IUPAC.extended_protein 234 return CodonTable.CodonTable(table.nucleotide_alphabet, proteinX, 235 MissingTable(table.forward_table), 236 table.back_table, table.start_codons, 237 table.stop_codons)
238 239 # end of hacks 240
241 -def seq3(seq):
242 """Turn a one letter code protein sequence into one with three letter codes. 243 244 The single input argument 'seq' should be a protein sequence using single 245 letter codes, either as a python string or as a Seq or MutableSeq object. 246 247 This function returns the amino acid sequence as a string using the three 248 letter amino acid codes. Output follows the IUPAC standard (including 249 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U 250 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. Any unknown 251 character (including possible gap characters), is changed into 'Xaa'. 252 253 e.g. 254 >>> from Bio.SeqUtils import seq3 255 >>> seq3("MAIVMGRWKGAR*") 256 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer' 257 258 This function was inspired by BioPerl's seq3. 259 """ 260 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp', 261 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His', 262 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met', 263 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg', 264 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp', 265 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter', 266 'U':'Sel', 'O':'Pyl', 'J':'Xle', 267 } 268 #We use a default of 'Xaa' for undefined letters 269 #Note this will map '-' to 'Xaa' which may be undesirable! 270 return ''.join([threecode.get(aa,'Xaa') for aa in seq])
271 272 273 # }}} 274 275 ###################################### 276 # Mixed ??? 277 ###################### 278 # {{{ 279
280 -def translate(seq, frame = 1, genetic_code = 1, translator = None):
281 """Translation of DNA in one of the six different reading frames (DEPRECATED). 282 283 Use the Bio.Seq.Translate function, or the Seq object's translate method 284 instead: 285 286 >>> from Bio.Seq import Seq 287 >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG") 288 >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUA") 289 >>> for frame in [0,1,2] : 290 ... print my_seq[frame:].translate() 291 ... 292 MAIVMGR*KGAR* 293 WPL*WAAERVPDS 294 GHCNGPLKGCPIV 295 >>> for frame in [0,1,2] : 296 ... print my_seq.reverse_complement()[frame:].translate() 297 ... 298 YYRAPFQRPITMA 299 TIGHPFSGPLQWP 300 LSGTLSAAHYNGH 301 """ 302 import warnings 303 warnings.warn("Bio.SeqUtils.translate() has been deprecated, and we intend" \ 304 +" to remove it in a future release of Biopython. Please use"\ 305 +" the method or function in Bio.Seq instead, as described in"\ 306 +" the Tutorial.", DeprecationWarning) 307 from Bio import Translate #This will also trigger a DeprecationWarning 308 309 if frame not in [1,2,3,-1,-2,-3]: 310 raise ValueError('invalid frame') 311 312 if not translator: 313 table = makeTableX(CodonTable.ambiguous_dna_by_id[genetic_code]) 314 translator = Translate.Translator(table) 315 316 #Does this frame calculation do something sensible? No RC taken! 317 return translator.translate(Seq(seq[frame-1:], IUPAC.ambiguous_dna)).data
318
319 -def GC_Frame(seq, genetic_code = 1):
320 """Just an alias for six_frame_translations (OBSOLETE). 321 322 Use six_frame_translation directly, as this function may be deprecated 323 in a future release.""" 324 return six_frame_translations(seq, genetic_code)
325
326 -def six_frame_translations(seq, genetic_code = 1):
327 """Formatted string showing the 6 frame translations and GC content. 328 329 nice looking 6 frame translation with GC content - code from xbbtools 330 similar to DNA Striders six-frame translation 331 332 e.g. 333 from Bio.SeqUtils import six_frame_translations 334 print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA") 335 """ 336 from Bio.Seq import reverse_complement, translate 337 anti = reverse_complement(seq) 338 comp = anti[::-1] 339 length = len(seq) 340 frames = {} 341 for i in range(0,3): 342 frames[i+1] = translate(seq[i:], genetic_code) 343 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code)) 344 345 # create header 346 if length > 20: 347 short = '%s ... %s' % (seq[:10], seq[-10:]) 348 else: 349 short = seq 350 #TODO? Remove the date as this would spoil any unit test... 351 date = time.strftime('%y %b %d, %X', time.localtime(time.time())) 352 header = 'GC_Frame: %s, ' % date 353 for nt in ['a','t','g','c']: 354 header += '%s:%d ' % (nt, seq.count(nt.upper())) 355 356 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq)) 357 res = header 358 359 for i in range(0,length,60): 360 subseq = seq[i:i+60] 361 csubseq = comp[i:i+60] 362 p = i/3 363 res = res + '%d/%d\n' % (i+1, i/3+1) 364 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n' 365 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n' 366 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n' 367 # seq 368 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq)) 369 res = res + csubseq.lower() + '\n' 370 # - frames 371 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n' 372 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n' 373 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n' 374 return res
375 376 # }}} 377 378 ###################################### 379 # FASTA file utilities 380 ###################### 381 # {{{ 382
383 -def fasta_uniqids(file):
384 """Checks and changes the name/ID's to be unique identifiers by adding numbers (OBSOLETE). 385 386 file - a FASTA format filename to read in. 387 388 No return value, the output is written to screen. 389 """ 390 dict = {} 391 txt = open(file).read() 392 entries = [] 393 for entry in txt.split('>')[1:]: 394 name, seq= entry.split('\n',1) 395 name = name.split()[0].split(',')[0] 396 397 if name in dict: 398 n = 1 399 while 1: 400 n = n + 1 401 _name = name + str(n) 402 if _name not in dict: 403 name = _name 404 break 405 406 dict[name] = seq 407 408 for name, seq in dict.items(): 409 print '>%s\n%s' % (name, seq)
410
411 -def quick_FASTA_reader(file):
412 """Simple FASTA reader, returning a list of string tuples. 413 414 The single argument 'file' should be the filename of a FASTA format file. 415 This function will open and read in the entire file, constructing a list 416 of all the records, each held as a tuple of strings (the sequence name or 417 title, and its sequence). 418 419 This function was originally intended for use on large files, where its 420 low overhead makes it very fast. However, because it returns the data as 421 a single in memory list, this can require a lot of RAM on large files. 422 423 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which 424 allows you to iterate over the records one by one (avoiding having all the 425 records in memory at once). Using Bio.SeqIO also makes it easy to switch 426 between different input file formats. However, please note that rather 427 than simple strings, Bio.SeqIO uses SeqRecord objects for each record. 428 """ 429 #Want to split on "\n>" not just ">" in case there are any extra ">" 430 #in the name/description. So, in order to make sure we also split on 431 #the first entry, prepend a "\n" to the start of the file. 432 handle = open(file) 433 txt = "\n" + handle.read() 434 handle.close() 435 entries = [] 436 for entry in txt.split('\n>')[1:]: 437 name,seq= entry.split('\n',1) 438 seq = seq.replace('\n','').replace(' ','').upper() 439 entries.append((name, seq)) 440 return entries
441
442 -def apply_on_multi_fasta(file, function, *args):
443 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE). 444 445 file - filename of a FASTA format file 446 function - the function you wish to invoke on each record 447 *args - any extra arguments you want passed to the function 448 449 This function will iterate over each record in a FASTA file as SeqRecord 450 objects, calling your function with the record (and supplied args) as 451 arguments. 452 453 This function returns a list. For those records where your function 454 returns a value, this is taken as a sequence and used to construct a 455 FASTA format string. If your function never has a return value, this 456 means apply_on_multi_fasta will return an empty list. 457 """ 458 try: 459 f = globals()[function] 460 except: 461 raise NotImplementedError("%s not implemented" % function) 462 463 handle = open(file, 'r') 464 records = SeqIO.parse(handle, "fasta") 465 results = [] 466 for record in records: 467 arguments = [record.sequence] 468 for arg in args: arguments.append(arg) 469 result = f(*arguments) 470 if result: 471 results.append('>%s\n%s' % (record.name, result)) 472 handle.close() 473 return results
474
475 -def quicker_apply_on_multi_fasta(file, function, *args):
476 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE). 477 478 file - filename of a FASTA format file 479 function - the function you wish to invoke on each record 480 *args - any extra arguments you want passed to the function 481 482 This function will use quick_FASTA_reader to load every record in the 483 FASTA file into memory as a list of tuples. For each record, it will 484 call your supplied function with the record as a tuple of the name and 485 sequence as strings (plus any supplied args). 486 487 This function returns a list. For those records where your function 488 returns a value, this is taken as a sequence and used to construct a 489 FASTA format string. If your function never has a return value, this 490 means quicker_apply_on_multi_fasta will return an empty list. 491 """ 492 try: 493 f = globals()[function] 494 except: 495 raise NotImplementedError("%s not implemented" % function) 496 497 entries = quick_FASTA_reader(file) 498 results = [] 499 for name, seq in entries: 500 arguments = [seq] 501 for arg in args: arguments.append(arg) 502 result = f(*arguments) 503 if result: 504 results.append('>%s\n%s' % (name, result)) 505 handle.close() 506 return results
507 508 # }}} 509 510 ###################################### 511 # Main 512 ##################### 513 # {{{ 514 515 if __name__ == '__main__': 516 import sys, getopt 517 # crude command line options to use most functions directly on a FASTA file 518 options = {'apply_on_multi_fasta':0, 519 'quick':0, 520 'uniq_ids':0, 521 } 522 523 optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=', 524 'help', 'quick', 'uniq_ids', 'search=']) 525 for arg in optlist: 526 if arg[0] in ['-h', '--help']: 527 pass 528 elif arg[0] in ['--describe']: 529 # get all new functions from this file 530 mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)] 531 mol_funcs.sort() 532 print 'available functions:' 533 for f in mol_funcs: print '\t--%s' % f 534 print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas' 535 536 sys.exit(0) 537 elif arg[0] in ['--apply_on_multi_fasta']: 538 options['apply_on_multi_fasta'] = arg[1] 539 elif arg[0] in ['--search']: 540 options['search'] = arg[1] 541 else: 542 key = re.search('-*(.+)', arg[0]).group(1) 543 options[key] = 1 544 545 546 if options.get('apply_on_multi_fasta'): 547 file = args[0] 548 function = options['apply_on_multi_fasta'] 549 arguments = [] 550 if options.get('search'): 551 arguments = options['search'] 552 if function == 'xGC_skew': 553 arguments = 1000 554 if options.get('quick'): 555 results = quicker_apply_on_multi_fasta(file, function, arguments) 556 else: 557 results = apply_on_multi_fasta(file, function, arguments) 558 for result in results: print result 559 560 elif options.get('uniq_ids'): 561 file = args[0] 562 fasta_uniqids(file) 563 564 # }}} 565