1
2
3
4
5
6
7
8
9
10 """Miscellaneous functions for dealing with sequences."""
11
12 import re, time
13 from Bio import SeqIO
14 from Bio import Translate
15 from Bio.Seq import Seq
16 from Bio import Alphabet
17 from Bio.Alphabet import IUPAC
18 from Bio.Data import IUPACData, CodonTable
19
20
21
22
23
24
25
27 """Reverse the sequence. Works on string sequences.
28
29 e.g.
30 >>> reverse("ACGGT")
31 'TGGCA'
32
33 """
34 r = list(seq)
35 r.reverse()
36 return ''.join(r)
37
39 """Calculates G+C content, returns the percentage (float between 0 and 100).
40
41 Copes mixed case seuqneces, and with the ambiguous nucleotide S (G or C)
42 when counting the G and C content. The percentage is calculated against
43 the full length, e.g.:
44
45 >>> from Bio.SeqUtils import GC
46 >>> GC("ACTGN")
47 40.0
48
49 Note that this will return zero for an empty sequence.
50 """
51 try :
52 gc = sum(map(seq.count,['G','C','g','c','S','s']))
53 return gc*100.0/len(seq)
54 except ZeroDivisionError :
55 return 0.0
56
57
59 """Calculates total G+C content plus first, second and third positions.
60
61 Returns a tuple of four floats (percentages between 0 and 100) for the
62 entire sequence, and the three codon positions. e.g.
63
64 >>> from Bio.SeqUtils import GC123
65 >>> GC123("ACTGTN")
66 (40.0, 50.0, 50.0, 0.0)
67
68 Copes with mixed case sequences, but does NOT deal with ambiguous
69 nucleotides.
70 """
71 d= {}
72 for nt in ['A','T','G','C']:
73 d[nt] = [0,0,0]
74
75 for i in range(0,len(seq),3):
76 codon = seq[i:i+3]
77 if len(codon) <3: codon += ' '
78 for pos in range(0,3):
79 for nt in ['A','T','G','C']:
80 if codon[pos] == nt or codon[pos] == nt.lower():
81 d[nt][pos] += 1
82 gc = {}
83 gcall = 0
84 nall = 0
85 for i in range(0,3):
86 try:
87 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i]
88 gc[i] = (d['G'][i] + d['C'][i])*100.0/n
89 except:
90 gc[i] = 0
91
92 gcall = gcall + d['G'][i] + d['C'][i]
93 nall = nall + n
94
95 gcall = 100.0*gcall/nall
96 return gcall, gc[0], gc[1], gc[2]
97
99 """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence.
100
101 Returns a list of ratios (floats), controlled by the length of the sequence
102 and the size of the window.
103
104 Does NOT look at any ambiguous nucleotides.
105 """
106
107 values = []
108 for i in range(0, len(seq), window):
109 s = seq[i: i + window]
110 g = s.count('G') + s.count('g')
111 c = s.count('C') + s.count('c')
112 skew = (g-c)/float(g+c)
113 values.append(skew)
114 return values
115
116 from math import pi, sin, cos, log
117 -def xGC_skew(seq, window = 1000, zoom = 100,
118 r = 300, px = 100, py = 100):
119 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!)."""
120 from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \
121 VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y
122 yscroll = Scrollbar(orient = VERTICAL)
123 xscroll = Scrollbar(orient = HORIZONTAL)
124 canvas = Canvas(yscrollcommand = yscroll.set,
125 xscrollcommand = xscroll.set, background = 'white')
126 win = canvas.winfo_toplevel()
127 win.geometry('700x700')
128
129 yscroll.config(command = canvas.yview)
130 xscroll.config(command = canvas.xview)
131 yscroll.pack(side = RIGHT, fill = Y)
132 xscroll.pack(side = BOTTOM, fill = X)
133 canvas.pack(fill=BOTH, side = LEFT, expand = 1)
134 canvas.update()
135
136 X0, Y0 = r + px, r + py
137 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r
138
139 ty = Y0
140 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq)))
141 ty +=20
142 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq)))
143 ty +=20
144 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue')
145 ty +=20
146 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta')
147 ty +=20
148 canvas.create_oval(x1,y1, x2, y2)
149
150 acc = 0
151 start = 0
152 for gc in GC_skew(seq, window):
153 r1 = r
154 acc+=gc
155
156 alpha = pi - (2*pi*start)/len(seq)
157 r2 = r1 - gc*zoom
158 x1 = X0 + r1 * sin(alpha)
159 y1 = Y0 + r1 * cos(alpha)
160 x2 = X0 + r2 * sin(alpha)
161 y2 = Y0 + r2 * cos(alpha)
162 canvas.create_line(x1,y1,x2,y2, fill = 'blue')
163
164 r1 = r - 50
165 r2 = r1 - acc
166 x1 = X0 + r1 * sin(alpha)
167 y1 = Y0 + r1 * cos(alpha)
168 x2 = X0 + r2 * sin(alpha)
169 y2 = Y0 + r2 * cos(alpha)
170 canvas.create_line(x1,y1,x2,y2, fill = 'magenta')
171
172 canvas.update()
173 start += window
174
175 canvas.configure(scrollregion = canvas.bbox(ALL))
176
187
213
214
215
216
217
218
219
220
221
222
223
224 -class ProteinX(Alphabet.ProteinAlphabet):
226
227 proteinX = ProteinX()
228
232 - def get(self, codon, stop_symbol):
237
244
245
246
248 """Turn a one letter code protein sequence into one with three letter codes.
249
250 The single input argument 'seq' should be a protein sequence using single
251 letter codes, either as a python string or as a Seq or MutableSeq object.
252
253 This function returns the amino acid sequence as a string using the three
254 letter amino acid codes. Output follows the IUPAC standard (including
255 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
256 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. Any unknown
257 character (including possible gap characters), is changed into 'Xaa'.
258
259 e.g.
260 >>> from Bio.SeqUtils import seq3
261 >>> seq3("MAIVMGRWKGAR*")
262 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'
263
264 This function was inspired by BioPerl's seq3.
265 """
266 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp',
267 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His',
268 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met',
269 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg',
270 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp',
271 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter',
272 'U':'Sel', 'O':'Pyl', 'J':'Xle',
273 }
274
275
276 return ''.join([threecode.get(aa,'Xaa') for aa in seq])
277
278
279
280
281
282
283
284
285
286 -def translate(seq, frame = 1, genetic_code = 1, translator = None):
287 """Translation of DNA in one of the six different reading frames (DEPRECATED).
288
289 Use the Bio.Seq.Translate function, or the Seq object's translate method
290 instead:
291
292 >>> from Bio.Seq import Seq
293 >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
294 >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUA")
295 >>> for frame in [0,1,2] :
296 ... print my_seq[frame:].translate()
297 ...
298 MAIVMGR*KGAR*
299 WPL*WAAERVPDS
300 GHCNGPLKGCPIV
301 >>> for frame in [0,1,2] :
302 ... print my_seq.reverse_complement()[frame:].translate()
303 ...
304 YYRAPFQRPITMA
305 TIGHPFSGPLQWP
306 LSGTLSAAHYNGH
307 """
308 import warnings
309 warnings.warn("Bio.SeqUtils.translate() has been deprecated, and we intend" \
310 +" to remove it in a future release of Biopython. Please use"\
311 +" the method or function in Bio.Seq instead, as described in"\
312 +" the Tutorial.", DeprecationWarning)
313
314 if frame not in [1,2,3,-1,-2,-3]:
315 raise ValueError('invalid frame')
316
317 if not translator:
318 table = makeTableX(CodonTable.ambiguous_dna_by_id[genetic_code])
319 translator = Translate.Translator(table)
320
321
322 return translator.translate(Seq(seq[frame-1:], IUPAC.ambiguous_dna)).data
323
325 """Just an alias for six_frame_translations (OBSOLETE).
326
327 Use six_frame_translation directly, as this function may be deprecated
328 in a future release."""
329 return six_frame_translations(seq, genetic_code)
330
332 """Formatted string showing the 6 frame translations and GC content.
333
334 nice looking 6 frame translation with GC content - code from xbbtools
335 similar to DNA Striders six-frame translation
336
337 e.g.
338 from Bio.SeqUtils import six_frame_translations
339 print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")
340 """
341 from Bio.Seq import reverse_complement, translate
342 anti = reverse_complement(seq)
343 comp = anti[::-1]
344 length = len(seq)
345 frames = {}
346 for i in range(0,3):
347 frames[i+1] = translate(seq[i:], genetic_code)
348 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code))
349
350
351 if length > 20:
352 short = '%s ... %s' % (seq[:10], seq[-10:])
353 else:
354 short = seq
355
356 date = time.strftime('%y %b %d, %X', time.localtime(time.time()))
357 header = 'GC_Frame: %s, ' % date
358 for nt in ['a','t','g','c']:
359 header += '%s:%d ' % (nt, seq.count(nt.upper()))
360
361 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq))
362 res = header
363
364 for i in range(0,length,60):
365 subseq = seq[i:i+60]
366 csubseq = comp[i:i+60]
367 p = i/3
368 res = res + '%d/%d\n' % (i+1, i/3+1)
369 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n'
370 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n'
371 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n'
372
373 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq))
374 res = res + csubseq.lower() + '\n'
375
376 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n'
377 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n'
378 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n'
379 return res
380
381
382
383
384
385
386
387
389 """Checks and changes the name/ID's to be unique identifiers by adding numbers (OBSOLETE).
390
391 file - a FASTA format filename to read in.
392
393 No return value, the output is written to screen.
394 """
395 dict = {}
396 txt = open(file).read()
397 entries = []
398 for entry in txt.split('>')[1:]:
399 name, seq= entry.split('\n',1)
400 name = name.split()[0].split(',')[0]
401
402 if name in dict:
403 n = 1
404 while 1:
405 n = n + 1
406 _name = name + str(n)
407 if _name not in dict:
408 name = _name
409 break
410
411 dict[name] = seq
412
413 for name, seq in dict.items():
414 print '>%s\n%s' % (name, seq)
415
417 """Simple FASTA reader, returning a list of string tuples.
418
419 The single argument 'file' should be the filename of a FASTA format file.
420 This function will open and read in the entire file, constructing a list
421 of all the records, each held as a tuple of strings (the sequence name or
422 title, and its sequence).
423
424 This function was originally intended for use on large files, where its
425 low overhead makes it very fast. However, because it returns the data as
426 a single in memory list, this can require a lot of RAM on large files.
427
428 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which
429 allows you to iterate over the records one by one (avoiding having all the
430 records in memory at once). Using Bio.SeqIO also makes it easy to switch
431 between different input file formats. However, please note that rather
432 than simple strings, Bio.SeqIO uses SeqRecord objects for each record.
433 """
434
435
436
437 handle = open(file)
438 txt = "\n" + handle.read()
439 handle.close()
440 entries = []
441 for entry in txt.split('\n>')[1:]:
442 name,seq= entry.split('\n',1)
443 seq = seq.replace('\n','').replace(' ','').upper()
444 entries.append((name, seq))
445 return entries
446
448 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE).
449
450 file - filename of a FASTA format file
451 function - the function you wish to invoke on each record
452 *args - any extra arguments you want passed to the function
453
454 This function will iterate over each record in a FASTA file as SeqRecord
455 objects, calling your function with the record (and supplied args) as
456 arguments.
457
458 This function returns a list. For those records where your function
459 returns a value, this is taken as a sequence and used to construct a
460 FASTA format string. If your function never has a return value, this
461 means apply_on_multi_fasta will return an empty list.
462 """
463 try:
464 f = globals()[function]
465 except:
466 raise NotImplementedError("%s not implemented" % function)
467
468 handle = open(file, 'r')
469 records = SeqIO.parse(handle, "fasta")
470 results = []
471 for record in records:
472 arguments = [record.sequence]
473 for arg in args: arguments.append(arg)
474 result = f(*arguments)
475 if result:
476 results.append('>%s\n%s' % (record.name, result))
477 handle.close()
478 return results
479
481 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE).
482
483 file - filename of a FASTA format file
484 function - the function you wish to invoke on each record
485 *args - any extra arguments you want passed to the function
486
487 This function will use quick_FASTA_reader to load every record in the
488 FASTA file into memory as a list of tuples. For each record, it will
489 call your supplied function with the record as a tuple of the name and
490 sequence as strings (plus any supplied args).
491
492 This function returns a list. For those records where your function
493 returns a value, this is taken as a sequence and used to construct a
494 FASTA format string. If your function never has a return value, this
495 means quicker_apply_on_multi_fasta will return an empty list.
496 """
497 try:
498 f = globals()[function]
499 except:
500 raise NotImplementedError("%s not implemented" % function)
501
502 entries = quick_FASTA_reader(file)
503 results = []
504 for name, seq in entries:
505 arguments = [seq]
506 for arg in args: arguments.append(arg)
507 result = f(*arguments)
508 if result:
509 results.append('>%s\n%s' % (name, result))
510 handle.close()
511 return results
512
513
514
515
516
517
518
519
520 if __name__ == '__main__':
521 import sys, getopt
522
523 options = {'apply_on_multi_fasta':0,
524 'quick':0,
525 'uniq_ids':0,
526 }
527
528 optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=',
529 'help', 'quick', 'uniq_ids', 'search='])
530 for arg in optlist:
531 if arg[0] in ['-h', '--help']:
532 pass
533 elif arg[0] in ['--describe']:
534
535 mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)]
536 mol_funcs.sort()
537 print 'available functions:'
538 for f in mol_funcs: print '\t--%s' % f
539 print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas'
540
541 sys.exit(0)
542 elif arg[0] in ['--apply_on_multi_fasta']:
543 options['apply_on_multi_fasta'] = arg[1]
544 elif arg[0] in ['--search']:
545 options['search'] = arg[1]
546 else:
547 key = re.search('-*(.+)', arg[0]).group(1)
548 options[key] = 1
549
550
551 if options.get('apply_on_multi_fasta'):
552 file = args[0]
553 function = options['apply_on_multi_fasta']
554 arguments = []
555 if options.get('search'):
556 arguments = options['search']
557 if function == 'xGC_skew':
558 arguments = 1000
559 if options.get('quick'):
560 results = quicker_apply_on_multi_fasta(file, function, arguments)
561 else:
562 results = apply_on_multi_fasta(file, function, arguments)
563 for result in results: print result
564
565 elif options.get('uniq_ids'):
566 file = args[0]
567 fasta_uniqids(file)
568
569
570