1
2
3
4
5
6
7
8 """
9 Bio.GFF.easy: some functions to ease the use of Biopython
10 """
11
12 __version__ = "$Revision: 1.12 $"
13
14
15 import copy
16 import re
17 import sys
18
19 import Bio
20 from Bio import GenBank
21 from Bio.Data import IUPACData
22 from Bio.Seq import Seq
23
24 from Bio import SeqIO
25 from Bio import SeqUtils
26
27 import GenericTools
28
30 """ JH: accessing feature.qualifiers as a list is stupid. Here's a dict that does it"""
31 - def __init__(self, feature_list, default=None):
45
46 -class Location(GenericTools.VerboseList):
47 """
48 this is really best interfaced through LocationFromString
49 fuzzy: < or >
50 join: {0 = no join, 1 = join, 2 = order}
51
52 >>> location = Location([Location([339]), Location([564])]) # zero-based
53 >>> location
54 Location(Location(339), Location(564))
55 >>> print location # one-based
56 340..565
57 >>> print location.five_prime()
58 340
59 >>> location_rev = Location([Location([339]), Location([564])], 1)
60 >>> print location_rev
61 complement(340..565)
62 >>> print location_rev.five_prime()
63 565
64 """
65 - def __init__(self, the_list, complement=0, seqname=None):
71
73 if self.join == 1:
74 label = 'join'
75 elif self.join == 2:
76 label = 'order'
77 return "%s(%s)" % (label, ",".join(map(str, self)))
78
80 if self.seqname:
81 format = "%s:%%s" % self.seqname
82 else:
83 format = "%s"
84
85 if self.complement:
86 format = format % "complement(%s)"
87
88 if self.join:
89 return format % self._joinstr()
90
91 elif isinstance(self[0], list):
92 return format % "%s..%s" % (str(self[0]), str(self[1]))
93 else:
94 if self.fuzzy:
95 format = format % self.fuzzy + "%s"
96 return format % str(self[0] + 1)
97
99 return "Location(%s)" % ", ".join(map(repr, self))
100
101 direction2index = {1: 0, -1: -1}
103 """
104 1: 5'
105 -1: 3'
106
107 >>> loc1 = LocationFromString("join(1..3,complement(5..6))")
108 >>> loc1.direction_and_index(1)
109 (1, 0)
110 >>> loc1.direction_and_index(-1)
111 (-1, -1)
112 >>> loc1.reverse()
113 >>> print loc1
114 complement(join(1..3,complement(5..6)))
115 >>> loc1.direction_and_index(1)
116 (-1, -1)
117 """
118 if self.complement:
119 direction = direction * -1
120 index = self.direction2index[direction]
121 return direction, index
122
124 """
125 >>> loc = LocationFromString("complement(join(1..5,complement(6..10)))")
126 >>> loc.findside(1)
127 Location(5)
128 >>> loc.findside(-1)
129 Location(0)
130 """
131 direction, index = self.direction_and_index(direction)
132 if self.join or isinstance(self[0], list):
133 return self[index].findside(direction)
134 else:
135 return self
136
138 """
139 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))")
140 >>> loc.findseqname_3prime()
141 'MOOCOW'
142 """
143 return self.findseqname(-1)
144
146 """
147 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))")
148 >>> loc.findseqname()
149 'SEQ'
150 >>> loc.findseqname(-1)
151 'MOOCOW'
152 """
153 direction, index = self.direction_and_index(direction)
154 if self.seqname:
155 return self.seqname
156 elif self.join:
157 return self[index].findseqname(direction)
158 else:
159 raise AttributeError('no sequence name')
160
165
167 """
168 WARNING: doesn't deal with joins!!!!
169 """
170 return self.end()-self.start()
171
173 """
174 WARNING: doesn't deal with joins!!!!
175
176 >>> location1 = LocationFromString("1..50")
177 >>> location2 = LocationFromString("25..200")
178 >>> print location1.intersection(location2)
179 25..50
180 >>> print location1.intersection(location2)
181 25..50
182 """
183 if self.start() >= other.start():
184 start = self.start()
185 else:
186 start = other.start()
187 if self.end() <= other.end():
188 end = self.end()
189 else:
190 end = other.end()
191 return Location([Location([start]), Location([end])])
192
199
206
213
215 """
216 >>> fwd_location = LocationFromString('X:5830132..5831528')
217 >>> print fwd_location.sublocation(LocationFromString('1..101'))
218 X:5830132..5830232
219 >>> print fwd_location.sublocation(LocationFromString('1267..1286'))
220 X:5831398..5831417
221 >>> rev_location = LocationFromString('I:complement(8415686..8416216)')
222 >>> print rev_location.sublocation(LocationFromString('1..101'))
223 I:complement(8416116..8416216)
224 >>> print rev_location.sublocation(LocationFromString('100..200'))
225 I:complement(8416017..8416117)
226 """
227
228 absolute_location = copy.deepcopy(self)
229 for i in xrange(2):
230 absolute_location[i] = self.five_prime().add(sub_location[i], self.complement)
231 if absolute_location.complement:
232 list.reverse(absolute_location)
233 return absolute_location
234
236 return self.add(addend)
237
238 - def add(self, addend, complement=0):
239 self_copy = copy.deepcopy(self)
240 if isinstance(addend, Location):
241 addend = addend[0]
242 if complement:
243 addend *= -1
244 self_copy[0] += addend
245 return self_copy
246
248 return self + -subtrahend
249
252
254 """
255 >>> loc1 = LocationFromString("join(I:complement(1..9000),I:complement(9001..10000))")
256 >>> loc1.reorient()
257 >>> print loc1
258 complement(join(I:1..9000,I:9001..10000))
259 >>> loc2 = LocationFromString("join(I:complement(1..9000),I:9001..10000)")
260 >>> loc2.reorient()
261 >>> print loc2
262 join(I:complement(1..9000),I:9001..10000)
263 """
264 if self.join:
265 if len([x for x in self if x.complement]) == len(self):
266 self.reverse()
267 for segment in self:
268 segment.reverse()
269
271 """
272 works for single level non-complex joins
273
274 >>> LOC = LocationFromString
275 >>> l1 = LOC("join(alpha:1..30,alpha:50..70)")
276 >>> print l1.bounding()
277 join(alpha:1..70)
278 >>> l2 = LOC("join(alpha:1..30,alpha:complement(50..70))")
279 >>> print l2.bounding()
280 join(alpha:1..30,alpha:complement(50..70))
281 >>> l3 = LOC("join(alpha:1..30,alpha:complement(50..70),beta:6..20,alpha:25..45)")
282 >>> print l3.bounding()
283 join(alpha:1..45,alpha:complement(50..70),beta:6..20)
284
285 """
286 if not self.join:
287 return
288
289 seqdict = {}
290 seqkeys = []
291 for subloc in self:
292 assert subloc.seqname
293 assert not subloc.join
294 try:
295 seqdict[_hashname(subloc)].append(subloc)
296 except KeyError:
297 key = _hashname(subloc)
298 seqdict[key] = [subloc]
299 seqkeys.append(key)
300
301 res = LocationJoin()
302 for key in seqkeys:
303 locations = seqdict[key]
304 coords = []
305 for subloc in locations:
306 coords.append(subloc.start())
307 coords.append(subloc.end())
308 res.append(LocationFromCoords(min(coords), max(coords), locations[0].complement, locations[0].seqname))
309 return res
310
313
315 """
316 >>> join = LocationJoin([LocationFromCoords(339, 564, 1), LocationFromString("complement(100..339)")])
317 >>> appendloc = LocationFromString("order(complement(66..99),complement(5..55))")
318 >>> join.append(appendloc)
319 >>> print join
320 join(complement(340..565),complement(100..339),order(complement(66..99),complement(5..55)))
321 >>> join2 = LocationJoin()
322 >>> join2.append(LocationFromString("complement(66..99)"))
323 >>> join2.append(LocationFromString("complement(5..55)"))
324 >>> print join2
325 join(complement(66..99),complement(5..55))
326 """
327 - def __init__(self, the_list = [], complement=0, seqname=None):
333
335 """
336 >>> print LocationFromCoords(339, 564)
337 340..565
338 >>> print LocationFromCoords(339, 564, seqname="I")
339 I:340..565
340 >>> print LocationFromCoords(999, 3234, "-", seqname="NC_343434")
341 NC_343434:complement(1000..3235)
342 """
343 - def __init__(self, start, end, strand=0, seqname=None):
349
350
351
352 re_complement = re.compile(r"^complement\((.*)\)$")
353 re_seqname = re.compile(r"^(?!join|order|complement)([^\:]+?):(.*)$")
354 re_join = re.compile(r"^(join|order)\((.*)\)$")
355 re_dotdot = re.compile(r"^([><]*\d+)\.\.([><]*\d+)$")
356 re_fuzzy = re.compile(r"^([><])(\d+)")
358 """
359 >>> # here are some tests from http://www.ncbi.nlm.nih.gov/collab/FT/index.html#location
360 >>> print LocationFromString("467")
361 467
362 >>> print LocationFromString("340..565")
363 340..565
364 >>> print LocationFromString("<345..500")
365 <345..500
366 >>> print LocationFromString("<1..888")
367 <1..888
368 >>> # (102.110) and 123^124 syntax unimplemented
369 >>> print LocationFromString("join(12..78,134..202)")
370 join(12..78,134..202)
371 >>> print LocationFromString("complement(join(2691..4571,4918..5163))")
372 complement(join(2691..4571,4918..5163))
373 >>> print LocationFromString("join(complement(4918..5163),complement(2691..4571))")
374 join(complement(4918..5163),complement(2691..4571))
375 >>> print LocationFromString("order(complement(4918..5163),complement(2691..4571))")
376 order(complement(4918..5163),complement(2691..4571))
377 >>> print LocationFromString("NC_001802x.fna:73..78")
378 NC_001802x.fna:73..78
379 >>> print LocationFromString("J00194:100..202")
380 J00194:100..202
381
382 >>> print LocationFromString("join(117505..118584,1..609)")
383 join(117505..118584,1..609)
384 >>> print LocationFromString("join(test3:complement(4..6),test3:complement(1..3))")
385 join(test3:complement(4..6),test3:complement(1..3))
386 >>> print LocationFromString("test3:join(test1:complement(1..3),4..6)")
387 test3:join(test1:complement(1..3),4..6)
388 """
420
422 if filename:
423 return open(filename)
424 else:
425 return sys.stdin
426
428 """
429 >>> record = fasta_single(string=\"""
430 ... >gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1]
431 ... MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAVNPGLLETSEGCRQILGQLQPSLQT
432 ... GSEELRSLYNTVATLYCVHQRIEIKDTKEALDKIEEEQNKSKKKAQQAAADTGHSNQVSQNYPIVQNIQG
433 ... QMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAA
434 ... EWDRVHPVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTNNPPIPVGEIYKRWIILGLNKIVRMYSPT
435 ... SILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPAATLEEMMTAC
436 ... QGVGGPGHKARVLAEAMSQVTNSATIMMQRGNFRNQRKIVKCFNCGKEGHTARNCRAPRKKGCWKCGKEG
437 ... HQMKDCTERQANFLGKIWPSYKGRPGNFLQSRPEPTAPPEESFRSGVETTTPPQKQEPIDKELYPLTSLR
438 ... SLFGNDPSSQ
439 ... \""")
440 >>> record.id
441 'gi|9629360|ref|NP_057850.1|'
442 >>> record.description
443 'gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1]'
444 >>> record.seq[0:5]
445 Seq('MGARA', SingleLetterAlphabet())
446 """
447
448
449 if string:
450 import cStringIO
451 handle = cStringIO.StringIO(string)
452 else :
453 handle = open_file(filename)
454 try :
455 record = SeqIO.parse(handle, format="fasta").next()
456 except StopIteration :
457 record = None
458 return record
459
461
462
463
464
465 reader = SeqIO.parse(open_file(filename), format="fasta")
466 while True:
467 record = reader.next()
468 if record is None:
469 raise StopIteration
470 else:
471 yield record
472
474 """
475 >>> records = fasta_readrecords('GFF/multi.fna')
476 >>> records[0].id
477 'test1'
478 >>> records[2].seq
479 Seq('AAACACAC', SingleLetterAlphabet())
480 """
481 return list(SeqIO.parse(open_file(filename), format="fasta"))
482
487
489 """
490 >>> record = genbank_single("GFF/NC_001422.gbk")
491 >>> record.taxonomy
492 ['Viruses', 'ssDNA viruses', 'Microviridae', 'Microvirus']
493 >>> cds = record.features[-4]
494 >>> cds.key
495 'CDS'
496 >>> location = LocationFromString(cds.location)
497 >>> print location
498 2931..3917
499 >>> subseq = record_subseq(record, location)
500 >>> subseq[0:20]
501 Seq('ATGTTTGGTGCTATTGCTGG', Alphabet())
502 """
503 return GenBank.RecordParser().parse(open(filename))
504
506 """
507 >>> from Bio.SeqRecord import SeqRecord
508 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"),
509 ... "ref|NC_001422",
510 ... "Coliphage phiX174, complete genome",
511 ... "bases 1-11")
512 >>> record_subseq(record, LocationFromString("1..4")) # one-based
513 Seq('GAGT', Alphabet())
514 >>> record_subseq(record, LocationFromString("complement(1..4)")) # one-based
515 Seq('ACTC', Alphabet())
516 >>> record_subseq(record, LocationFromString("join(complement(1..4),1..4)")) # what an idea!
517 Seq('ACTCGAGT', Alphabet())
518 >>> loc = LocationFromString("complement(join(complement(5..7),1..4))")
519 >>> print loc
520 complement(join(complement(5..7),1..4))
521 >>> record_subseq(record, loc)
522 Seq('ACTCTTT', Alphabet())
523 >>> print loc
524 complement(join(complement(5..7),1..4))
525 >>> loc.reverse()
526 >>> record_subseq(record, loc)
527 Seq('AAAGAGT', Alphabet())
528 >>> record_subseq(record, loc, upper=1)
529 Seq('AAAGAGT', Alphabet())
530 """
531 if location.join:
532 subsequence_list = []
533 if location.complement:
534 location_copy = copy.copy(location)
535 list.reverse(location_copy)
536 else:
537 location_copy = location
538 for sublocation in location_copy:
539 if location.complement:
540 sublocation_copy = copy.copy(sublocation)
541 sublocation_copy.reverse()
542 else:
543 sublocation_copy = sublocation
544 subsequence_list.append(record_subseq(record, sublocation_copy, *args, **keywds).tostring())
545 return Seq(''.join(subsequence_list), record_sequence(record).alphabet)
546 else:
547 return record_coords(record, location.start(), location.end()+1, location.complement, *args, **keywds)
548
561
563 """
564 >>> from Bio.SeqRecord import SeqRecord
565 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"),
566 ... "ref|NC_001422",
567 ... "Coliphage phiX174, complete genome",
568 ... "bases 1-11")
569 >>> record_coords(record, 0, 4) # zero-based
570 Seq('GAGT', Alphabet())
571 >>> record_coords(record, 0, 4, "-") # zero-based
572 Seq('ACTC', Alphabet())
573 >>> record_coords(record, 0, 4, "-", upper=1) # zero-based
574 Seq('ACTC', Alphabet())
575 """
576
577 subseq = record_sequence(record)[start:end]
578 subseq_str = subseq.tostring()
579 subseq_str = subseq_str.upper()
580 subseq = Seq(subseq_str, subseq.alphabet)
581 if strand == '-' or strand == 1:
582 return subseq.reverse_complement()
583 else:
584 return subseq
585
586 -def _test(*args, **keywds):
587 import doctest, sys
588 doctest.testmod(sys.modules[__name__], *args, **keywds)
589
590 if __name__ == "__main__":
591 if __debug__:
592 _test()
593