1
2
3
4
5
6
7
8
9
10 import re, time
11 from Bio import SeqIO
12 from Bio import Translate
13 from Bio.Seq import Seq
14 from Bio import Alphabet
15 from Bio.Alphabet import IUPAC
16 from Bio.Data import IUPACData, CodonTable
17
18
19
20
21
22
23
25 """Reverse the sequence. Works on string sequences.
26 """
27 r = map(None, seq)
28 r.reverse()
29 return ''.join(r)
30
32 """ calculates G+C content """
33
34
35
36
37 d = {}
38 for nt in ['A','T','G','C','a','t','g','c','S','s']:
39 d[nt] = seq.count(nt)
40 gc = d.get('G',0) + d.get('C',0) + d.get('g',0) + d.get('c',0) + \
41 d.get('S',0) + d.get('s',0)
42
43 if gc == 0: return 0
44
45 return gc*100.0/len(seq)
46
48 " calculates total G+C content plus first, second and third position "
49
50 d= {}
51 for nt in ['A','T','G','C']:
52 d[nt] = [0,0,0]
53
54 for i in range(0,len(seq),3):
55 codon = seq[i:i+3]
56 if len(codon) <3: codon += ' '
57 for pos in range(0,3):
58 for nt in ['A','T','G','C']:
59 if codon[pos] == nt or codon[pos] == nt.lower():
60 d[nt][pos] = d[nt][pos] +1
61
62
63 gc = {}
64 gcall = 0
65 nall = 0
66 for i in range(0,3):
67 try:
68 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i]
69 gc[i] = (d['G'][i] + d['C'][i])*100.0/n
70 except:
71 gc[i] = 0
72
73 gcall = gcall + d['G'][i] + d['C'][i]
74 nall = nall + n
75
76 gcall = 100.0*gcall/nall
77 return gcall, gc[0], gc[1], gc[2]
78
90
91
92
93
94
95
96
97
98 try:
99 from Tkinter import *
100 except ImportError:
101 pass
102 from math import pi, sin, cos, log
103 -def xGC_skew(seq, window = 1000, zoom = 100,
104 r = 300, px = 100, py = 100):
105 " calculates and plots normal and accumulated GC skew (GRAPHICS !!!) "
106
107
108 yscroll = Scrollbar(orient = VERTICAL)
109 xscroll = Scrollbar(orient = HORIZONTAL)
110 canvas = Canvas(yscrollcommand = yscroll.set,
111 xscrollcommand = xscroll.set, background = 'white')
112 win = canvas.winfo_toplevel()
113 win.geometry('700x700')
114
115 yscroll.config(command = canvas.yview)
116 xscroll.config(command = canvas.xview)
117 yscroll.pack(side = RIGHT, fill = Y)
118 xscroll.pack(side = BOTTOM, fill = X)
119 canvas.pack(fill=BOTH, side = LEFT, expand = 1)
120 canvas.update()
121
122 X0, Y0 = r + px, r + py
123 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r
124
125 ty = Y0
126 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq)))
127 ty +=20
128 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq)))
129 ty +=20
130 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue')
131 ty +=20
132 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta')
133 ty +=20
134 canvas.create_oval(x1,y1, x2, y2)
135
136 acc = 0
137 start = 0
138 for gc in GC_skew(seq, window):
139 r1 = r
140 acc+=gc
141
142 alpha = pi - (2*pi*start)/len(seq)
143 r2 = r1 - gc*zoom
144 x1 = X0 + r1 * sin(alpha)
145 y1 = Y0 + r1 * cos(alpha)
146 x2 = X0 + r2 * sin(alpha)
147 y2 = Y0 + r2 * cos(alpha)
148 canvas.create_line(x1,y1,x2,y2, fill = 'blue')
149
150 r1 = r - 50
151 r2 = r1 - acc
152 x1 = X0 + r1 * sin(alpha)
153 y1 = Y0 + r1 * cos(alpha)
154 x2 = X0 + r2 * sin(alpha)
155 y2 = Y0 + r2 * cos(alpha)
156 canvas.create_line(x1,y1,x2,y2, fill = 'magenta')
157
158 canvas.update()
159 start = start + window
160
161
162 canvas.configure(scrollregion = canvas.bbox(ALL))
163
171
173 """ search for a DNA subseq in sequence
174 use ambiguous values (like N = A or T or C or G, R = A or G etc.)
175 searches only on forward strand
176 """
177 pattern = ''
178 for nt in subseq:
179 value = IUPACData.ambiguous_dna_values[nt]
180 if len(value) == 1:
181 pattern += value
182 else:
183 pattern += '[%s]' % value
184
185 pos = -1
186 result = [pattern]
187 l = len(seq)
188 while 1:
189 pos+=1
190 s = seq[pos:]
191 m = re.search(pattern, s)
192 if not m: break
193 pos += int(m.start(0))
194 result.append(pos)
195
196 return result
197
198
199
200
201
202
203
204
205
206
207
208 -class ProteinX(Alphabet.ProteinAlphabet):
210
211 proteinX = ProteinX()
212
216 - def get(self, codon, stop_symbol):
221
228
229
230
231
233 """
234 Method that returns the amino acid sequence as a
235 list of three letter codes. Output follows the IUPAC standard plus 'Ter' for
236 terminator. Any unknown character, including the default
237 unknown character 'X', is changed into 'Xaa'. A noncoded
238 aminoacid selenocystein is recognized (Sel, U).
239 """
240 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp',
241 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His',
242 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met',
243 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg',
244 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp',
245 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter',
246 'U':'Sel'
247 }
248
249 return ''.join([threecode.get(aa,'Xer') for aa in seq])
250
251
252
253
254
255
256
257
258
259 -def translate(seq, frame = 1, genetic_code = 1, translator = None):
269
273
275 """
276 nice looking 6 frame translation with GC content - code from xbbtools
277 similar to DNA Striders six-frame translation
278 """
279 from Bio.Seq import reverse_complement
280 anti = reverse_complement(seq)
281 comp = anti[::-1]
282 length = len(seq)
283 frames = {}
284 for i in range(0,3):
285 frames[i+1] = translate(seq[i:], genetic_code)
286 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code))
287
288
289 if length > 20:
290 short = '%s ... %s' % (seq[:10], seq[-10:])
291 else:
292 short = seq
293 date = time.strftime('%y %b %d, %X', time.localtime(time.time()))
294 header = 'GC_Frame: %s, ' % date
295 for nt in ['a','t','g','c']:
296 header += '%s:%d ' % (nt, seq.count(nt.upper()))
297
298 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq))
299 res = header
300
301 for i in range(0,length,60):
302 subseq = seq[i:i+60]
303 csubseq = comp[i:i+60]
304 p = i/3
305 res = res + '%d/%d\n' % (i+1, i/3+1)
306 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n'
307 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n'
308 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n'
309
310 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq))
311 res = res + csubseq.lower() + '\n'
312
313 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n'
314 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n'
315 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n'
316 return res
317
318
319
320
321
322
323
324
347
349 " simple and FASTA reader, preferable to be used on large files "
350 txt = open(file).read()
351 entries = []
352 for entry in txt.split('>')[1:]:
353 name,seq= entry.split('\n',1)
354 seq = seq.replace('\n','').replace(' ','').upper()
355 entries.append((name, seq))
356
357 return entries
358
360 " apply function on each sequence in a multiple FASTA file "
361 try:
362 f = globals()[function]
363 except:
364 raise NotImplementedError, "%s not implemented" % function
365
366 handle = open(file, 'r')
367 records = SeqIO.parse(handle, "fasta")
368 results = []
369 for record in records:
370 arguments = [record.sequence]
371 for arg in args: arguments.append(arg)
372 result = f(*arguments)
373 if result:
374 results.append('>%s\n%s' % (record.title, result))
375 return results
376
378 " apply function on each sequence in a multiple FASTA file "
379 try:
380 f = globals()[function]
381 except:
382 raise NotImplementedError, "%s not implemented" % function
383
384 entries = quick_FASTA_reader(file)
385 results = []
386 for name, seq in entries:
387 arguments = [seq]
388 for arg in args: arguments.append(arg)
389 result = f(*arguments)
390 if result:
391 results.append('>%s\n%s' % (name, result))
392 return results
393
394
395
396
397
398
399
400
401 if __name__ == '__main__':
402 import sys, getopt
403
404 options = {'apply_on_multi_fasta':0,
405 'quick':0,
406 'uniq_ids':0,
407 }
408
409 optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=',
410 'help', 'quick', 'uniq_ids', 'search='])
411 for arg in optlist:
412 if arg[0] in ['-h', '--help']:
413 pass
414 elif arg[0] in ['--describe']:
415
416 mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)]
417 mol_funcs.sort()
418 print 'available functions:'
419 for f in mol_funcs: print '\t--%s' % f
420 print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas'
421
422 sys.exit(0)
423 elif arg[0] in ['--apply_on_multi_fasta']:
424 options['apply_on_multi_fasta'] = arg[1]
425 elif arg[0] in ['--search']:
426 options['search'] = arg[1]
427 else:
428 key = re.search('-*(.+)', arg[0]).group(1)
429 options[key] = 1
430
431
432 if options.get('apply_on_multi_fasta'):
433 file = args[0]
434 function = options['apply_on_multi_fasta']
435 arguments = []
436 if options.get('search'):
437 arguments = options['search']
438 if function == 'xGC_skew':
439 arguments = 1000
440 if options.get('quick'):
441 results = quicker_apply_on_multi_fasta(file, function, arguments)
442 else:
443 results = apply_on_multi_fasta(file, function, arguments)
444 for result in results: print result
445
446 elif options.get('uniq_ids'):
447 file = args[0]
448 fasta_uniqids(file)
449
450
451