Package Bio :: Package SeqUtils
[hide private]
[frames] | no frames]

Source Code for Package Bio.SeqUtils

  1  #!/usr/bin/env python 
  2  # Created: Wed May 29 08:07:18 2002 
  3  # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se 
  4  # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark. 
  5  # All rights reserved. 
  6  # This code is part of the Biopython distribution and governed by its 
  7  # license.  Please see the LICENSE file that should have been included 
  8  # as part of this package. 
  9   
 10  """Miscellaneous functions for dealing with sequences.""" 
 11   
 12  import re, time 
 13  from Bio import SeqIO 
 14  from Bio import Translate 
 15  from Bio.Seq import Seq 
 16  from Bio import Alphabet 
 17  from Bio.Alphabet import IUPAC 
 18  from Bio.Data import IUPACData, CodonTable 
 19   
 20   
 21  ###################################### 
 22  # DNA 
 23  ###################### 
 24  # {{{  
 25   
26 -def reverse(seq):
27 """Reverse the sequence. Works on string sequences. 28 29 e.g. 30 >>> reverse("ACGGT") 31 'TGGCA' 32 33 """ 34 r = list(seq) 35 r.reverse() 36 return ''.join(r)
37
38 -def GC(seq):
39 """Calculates G+C content, returns the percentage (float between 0 and 100). 40 41 Copes mixed case seuqneces, and with the ambiguous nucleotide S (G or C) 42 when counting the G and C content. The percentage is calculated against 43 the full length, e.g.: 44 45 >>> from Bio.SeqUtils import GC 46 >>> GC("ACTGN") 47 40.0 48 49 Note that this will return zero for an empty sequence. 50 """ 51 try : 52 gc = sum(map(seq.count,['G','C','g','c','S','s'])) 53 return gc*100.0/len(seq) 54 except ZeroDivisionError : 55 return 0.0
56 57
58 -def GC123(seq):
59 """Calculates total G+C content plus first, second and third positions. 60 61 Returns a tuple of four floats (percentages between 0 and 100) for the 62 entire sequence, and the three codon positions. e.g. 63 64 >>> from Bio.SeqUtils import GC123 65 >>> GC123("ACTGTN") 66 (40.0, 50.0, 50.0, 0.0) 67 68 Copes with mixed case sequences, but does NOT deal with ambiguous 69 nucleotides. 70 """ 71 d= {} 72 for nt in ['A','T','G','C']: 73 d[nt] = [0,0,0] 74 75 for i in range(0,len(seq),3): 76 codon = seq[i:i+3] 77 if len(codon) <3: codon += ' ' 78 for pos in range(0,3): 79 for nt in ['A','T','G','C']: 80 if codon[pos] == nt or codon[pos] == nt.lower(): 81 d[nt][pos] += 1 82 gc = {} 83 gcall = 0 84 nall = 0 85 for i in range(0,3): 86 try: 87 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i] 88 gc[i] = (d['G'][i] + d['C'][i])*100.0/n 89 except: 90 gc[i] = 0 91 92 gcall = gcall + d['G'][i] + d['C'][i] 93 nall = nall + n 94 95 gcall = 100.0*gcall/nall 96 return gcall, gc[0], gc[1], gc[2]
97
98 -def GC_skew(seq, window = 100):
99 """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence. 100 101 Returns a list of ratios (floats), controlled by the length of the sequence 102 and the size of the window. 103 104 Does NOT look at any ambiguous nucleotides. 105 """ 106 # 8/19/03: Iddo: added lowercase 107 values = [] 108 for i in range(0, len(seq), window): 109 s = seq[i: i + window] 110 g = s.count('G') + s.count('g') 111 c = s.count('C') + s.count('c') 112 skew = (g-c)/float(g+c) 113 values.append(skew) 114 return values
115 116 from math import pi, sin, cos, log
117 -def xGC_skew(seq, window = 1000, zoom = 100, 118 r = 300, px = 100, py = 100):
119 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!).""" 120 from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \ 121 VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y 122 yscroll = Scrollbar(orient = VERTICAL) 123 xscroll = Scrollbar(orient = HORIZONTAL) 124 canvas = Canvas(yscrollcommand = yscroll.set, 125 xscrollcommand = xscroll.set, background = 'white') 126 win = canvas.winfo_toplevel() 127 win.geometry('700x700') 128 129 yscroll.config(command = canvas.yview) 130 xscroll.config(command = canvas.xview) 131 yscroll.pack(side = RIGHT, fill = Y) 132 xscroll.pack(side = BOTTOM, fill = X) 133 canvas.pack(fill=BOTH, side = LEFT, expand = 1) 134 canvas.update() 135 136 X0, Y0 = r + px, r + py 137 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r 138 139 ty = Y0 140 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq))) 141 ty +=20 142 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq))) 143 ty +=20 144 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue') 145 ty +=20 146 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta') 147 ty +=20 148 canvas.create_oval(x1,y1, x2, y2) 149 150 acc = 0 151 start = 0 152 for gc in GC_skew(seq, window): 153 r1 = r 154 acc+=gc 155 # GC skew 156 alpha = pi - (2*pi*start)/len(seq) 157 r2 = r1 - gc*zoom 158 x1 = X0 + r1 * sin(alpha) 159 y1 = Y0 + r1 * cos(alpha) 160 x2 = X0 + r2 * sin(alpha) 161 y2 = Y0 + r2 * cos(alpha) 162 canvas.create_line(x1,y1,x2,y2, fill = 'blue') 163 # accumulated GC skew 164 r1 = r - 50 165 r2 = r1 - acc 166 x1 = X0 + r1 * sin(alpha) 167 y1 = Y0 + r1 * cos(alpha) 168 x2 = X0 + r2 * sin(alpha) 169 y2 = Y0 + r2 * cos(alpha) 170 canvas.create_line(x1,y1,x2,y2, fill = 'magenta') 171 172 canvas.update() 173 start += window 174 175 canvas.configure(scrollregion = canvas.bbox(ALL))
176
177 -def molecular_weight(seq):
178 """Calculate the molecular weight of a DNA sequence.""" 179 if type(seq) == type(''): seq = Seq(seq, IUPAC.unambiguous_dna) 180 weight_table = IUPACData.unambiguous_dna_weights 181 #TODO, use a generator expession once we drop Python 2.3? 182 #e.g. return sum(weight_table[x] for x in seq) 183 total = 0 184 for x in seq: 185 total += weight_table[x] 186 return total
187
188 -def nt_search(seq, subseq):
189 """Search for a DNA subseq in sequence. 190 191 use ambiguous values (like N = A or T or C or G, R = A or G etc.) 192 searches only on forward strand 193 """ 194 pattern = '' 195 for nt in subseq: 196 value = IUPACData.ambiguous_dna_values[nt] 197 if len(value) == 1: 198 pattern += value 199 else: 200 pattern += '[%s]' % value 201 202 pos = -1 203 result = [pattern] 204 l = len(seq) 205 while True: 206 pos+=1 207 s = seq[pos:] 208 m = re.search(pattern, s) 209 if not m: break 210 pos += int(m.start(0)) 211 result.append(pos) 212 return result
213 214 # }}} 215 216 ###################################### 217 # Protein 218 ###################### 219 # {{{ 220 221 # temporary hack for exception free translation of "dirty" DNA 222 # should be moved to ??? 223
224 -class ProteinX(Alphabet.ProteinAlphabet):
225 letters = IUPACData.extended_protein_letters + "X"
226 227 proteinX = ProteinX() 228
229 -class MissingTable:
230 - def __init__(self, table):
231 self._table = table
232 - def get(self, codon, stop_symbol):
233 try: 234 return self._table.get(codon, stop_symbol) 235 except CodonTable.TranslationError: 236 return 'X'
237
238 -def makeTableX(table):
239 assert table.protein_alphabet == IUPAC.extended_protein 240 return CodonTable.CodonTable(table.nucleotide_alphabet, proteinX, 241 MissingTable(table.forward_table), 242 table.back_table, table.start_codons, 243 table.stop_codons)
244 245 # end of hacks 246
247 -def seq3(seq):
248 """Turn a one letter code protein sequence into one with three letter codes. 249 250 The single input argument 'seq' should be a protein sequence using single 251 letter codes, either as a python string or as a Seq or MutableSeq object. 252 253 This function returns the amino acid sequence as a string using the three 254 letter amino acid codes. Output follows the IUPAC standard (including 255 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U 256 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. Any unknown 257 character (including possible gap characters), is changed into 'Xaa'. 258 259 e.g. 260 >>> from Bio.SeqUtils import seq3 261 >>> seq3("MAIVMGRWKGAR*") 262 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer' 263 264 This function was inspired by BioPerl's seq3. 265 """ 266 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp', 267 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His', 268 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met', 269 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg', 270 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp', 271 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter', 272 'U':'Sel', 'O':'Pyl', 'J':'Xle', 273 } 274 #We use a default of 'Xaa' for undefined letters 275 #Note this will map '-' to 'Xaa' which may be undesirable! 276 return ''.join([threecode.get(aa,'Xaa') for aa in seq])
277 278 279 # }}} 280 281 ###################################### 282 # Mixed ??? 283 ###################### 284 # {{{ 285
286 -def translate(seq, frame = 1, genetic_code = 1, translator = None):
287 """Translation of DNA in one of the six different reading frames (DEPRECATED). 288 289 Use the Bio.Seq.Translate function, or the Seq object's translate method 290 instead: 291 292 >>> from Bio.Seq import Seq 293 >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG") 294 >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUA") 295 >>> for frame in [0,1,2] : 296 ... print my_seq[frame:].translate() 297 ... 298 MAIVMGR*KGAR* 299 WPL*WAAERVPDS 300 GHCNGPLKGCPIV 301 >>> for frame in [0,1,2] : 302 ... print my_seq.reverse_complement()[frame:].translate() 303 ... 304 YYRAPFQRPITMA 305 TIGHPFSGPLQWP 306 LSGTLSAAHYNGH 307 """ 308 import warnings 309 warnings.warn("Bio.SeqUtils.translate() has been deprecated, and we intend" \ 310 +" to remove it in a future release of Biopython. Please use"\ 311 +" the method or function in Bio.Seq instead, as described in"\ 312 +" the Tutorial.", DeprecationWarning) 313 314 if frame not in [1,2,3,-1,-2,-3]: 315 raise ValueError('invalid frame') 316 317 if not translator: 318 table = makeTableX(CodonTable.ambiguous_dna_by_id[genetic_code]) 319 translator = Translate.Translator(table) 320 321 #Does this frame calculation do something sensible? No RC taken! 322 return translator.translate(Seq(seq[frame-1:], IUPAC.ambiguous_dna)).data
323
324 -def GC_Frame(seq, genetic_code = 1):
325 """Just an alias for six_frame_translations (OBSOLETE). 326 327 Use six_frame_translation directly, as this function may be deprecated 328 in a future release.""" 329 return six_frame_translations(seq, genetic_code)
330
331 -def six_frame_translations(seq, genetic_code = 1):
332 """Formatted string showing the 6 frame translations and GC content. 333 334 nice looking 6 frame translation with GC content - code from xbbtools 335 similar to DNA Striders six-frame translation 336 337 e.g. 338 from Bio.SeqUtils import six_frame_translations 339 print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA") 340 """ 341 from Bio.Seq import reverse_complement, translate 342 anti = reverse_complement(seq) 343 comp = anti[::-1] 344 length = len(seq) 345 frames = {} 346 for i in range(0,3): 347 frames[i+1] = translate(seq[i:], genetic_code) 348 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code)) 349 350 # create header 351 if length > 20: 352 short = '%s ... %s' % (seq[:10], seq[-10:]) 353 else: 354 short = seq 355 #TODO? Remove the date as this would spoil any unit test... 356 date = time.strftime('%y %b %d, %X', time.localtime(time.time())) 357 header = 'GC_Frame: %s, ' % date 358 for nt in ['a','t','g','c']: 359 header += '%s:%d ' % (nt, seq.count(nt.upper())) 360 361 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq)) 362 res = header 363 364 for i in range(0,length,60): 365 subseq = seq[i:i+60] 366 csubseq = comp[i:i+60] 367 p = i/3 368 res = res + '%d/%d\n' % (i+1, i/3+1) 369 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n' 370 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n' 371 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n' 372 # seq 373 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq)) 374 res = res + csubseq.lower() + '\n' 375 # - frames 376 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n' 377 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n' 378 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n' 379 return res
380 381 # }}} 382 383 ###################################### 384 # FASTA file utilities 385 ###################### 386 # {{{ 387
388 -def fasta_uniqids(file):
389 """Checks and changes the name/ID's to be unique identifiers by adding numbers (OBSOLETE). 390 391 file - a FASTA format filename to read in. 392 393 No return value, the output is written to screen. 394 """ 395 dict = {} 396 txt = open(file).read() 397 entries = [] 398 for entry in txt.split('>')[1:]: 399 name, seq= entry.split('\n',1) 400 name = name.split()[0].split(',')[0] 401 402 if name in dict: 403 n = 1 404 while 1: 405 n = n + 1 406 _name = name + str(n) 407 if _name not in dict: 408 name = _name 409 break 410 411 dict[name] = seq 412 413 for name, seq in dict.items(): 414 print '>%s\n%s' % (name, seq)
415
416 -def quick_FASTA_reader(file):
417 """Simple FASTA reader, returning a list of string tuples. 418 419 The single argument 'file' should be the filename of a FASTA format file. 420 This function will open and read in the entire file, constructing a list 421 of all the records, each held as a tuple of strings (the sequence name or 422 title, and its sequence). 423 424 This function was originally intended for use on large files, where its 425 low overhead makes it very fast. However, because it returns the data as 426 a single in memory list, this can require a lot of RAM on large files. 427 428 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which 429 allows you to iterate over the records one by one (avoiding having all the 430 records in memory at once). Using Bio.SeqIO also makes it easy to switch 431 between different input file formats. However, please note that rather 432 than simple strings, Bio.SeqIO uses SeqRecord objects for each record. 433 """ 434 #Want to split on "\n>" not just ">" in case there are any extra ">" 435 #in the name/description. So, in order to make sure we also split on 436 #the first entry, prepend a "\n" to the start of the file. 437 handle = open(file) 438 txt = "\n" + handle.read() 439 handle.close() 440 entries = [] 441 for entry in txt.split('\n>')[1:]: 442 name,seq= entry.split('\n',1) 443 seq = seq.replace('\n','').replace(' ','').upper() 444 entries.append((name, seq)) 445 return entries
446
447 -def apply_on_multi_fasta(file, function, *args):
448 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE). 449 450 file - filename of a FASTA format file 451 function - the function you wish to invoke on each record 452 *args - any extra arguments you want passed to the function 453 454 This function will iterate over each record in a FASTA file as SeqRecord 455 objects, calling your function with the record (and supplied args) as 456 arguments. 457 458 This function returns a list. For those records where your function 459 returns a value, this is taken as a sequence and used to construct a 460 FASTA format string. If your function never has a return value, this 461 means apply_on_multi_fasta will return an empty list. 462 """ 463 try: 464 f = globals()[function] 465 except: 466 raise NotImplementedError("%s not implemented" % function) 467 468 handle = open(file, 'r') 469 records = SeqIO.parse(handle, "fasta") 470 results = [] 471 for record in records: 472 arguments = [record.sequence] 473 for arg in args: arguments.append(arg) 474 result = f(*arguments) 475 if result: 476 results.append('>%s\n%s' % (record.name, result)) 477 handle.close() 478 return results
479
480 -def quicker_apply_on_multi_fasta(file, function, *args):
481 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE). 482 483 file - filename of a FASTA format file 484 function - the function you wish to invoke on each record 485 *args - any extra arguments you want passed to the function 486 487 This function will use quick_FASTA_reader to load every record in the 488 FASTA file into memory as a list of tuples. For each record, it will 489 call your supplied function with the record as a tuple of the name and 490 sequence as strings (plus any supplied args). 491 492 This function returns a list. For those records where your function 493 returns a value, this is taken as a sequence and used to construct a 494 FASTA format string. If your function never has a return value, this 495 means quicker_apply_on_multi_fasta will return an empty list. 496 """ 497 try: 498 f = globals()[function] 499 except: 500 raise NotImplementedError("%s not implemented" % function) 501 502 entries = quick_FASTA_reader(file) 503 results = [] 504 for name, seq in entries: 505 arguments = [seq] 506 for arg in args: arguments.append(arg) 507 result = f(*arguments) 508 if result: 509 results.append('>%s\n%s' % (name, result)) 510 handle.close() 511 return results
512 513 # }}} 514 515 ###################################### 516 # Main 517 ##################### 518 # {{{ 519 520 if __name__ == '__main__': 521 import sys, getopt 522 # crude command line options to use most functions directly on a FASTA file 523 options = {'apply_on_multi_fasta':0, 524 'quick':0, 525 'uniq_ids':0, 526 } 527 528 optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=', 529 'help', 'quick', 'uniq_ids', 'search=']) 530 for arg in optlist: 531 if arg[0] in ['-h', '--help']: 532 pass 533 elif arg[0] in ['--describe']: 534 # get all new functions from this file 535 mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)] 536 mol_funcs.sort() 537 print 'available functions:' 538 for f in mol_funcs: print '\t--%s' % f 539 print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas' 540 541 sys.exit(0) 542 elif arg[0] in ['--apply_on_multi_fasta']: 543 options['apply_on_multi_fasta'] = arg[1] 544 elif arg[0] in ['--search']: 545 options['search'] = arg[1] 546 else: 547 key = re.search('-*(.+)', arg[0]).group(1) 548 options[key] = 1 549 550 551 if options.get('apply_on_multi_fasta'): 552 file = args[0] 553 function = options['apply_on_multi_fasta'] 554 arguments = [] 555 if options.get('search'): 556 arguments = options['search'] 557 if function == 'xGC_skew': 558 arguments = 1000 559 if options.get('quick'): 560 results = quicker_apply_on_multi_fasta(file, function, arguments) 561 else: 562 results = apply_on_multi_fasta(file, function, arguments) 563 for result in results: print result 564 565 elif options.get('uniq_ids'): 566 file = args[0] 567 fasta_uniqids(file) 568 569 # }}} 570