Package Bio :: Package GFF :: Module easy
[hide private]
[frames] | no frames]

Source Code for Module Bio.GFF.easy

  1  #!/usr/bin/env python 
  2  # 
  3  # Copyright 2002 by Michael Hoffman.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """ 
  9  Bio.GFF.easy: some functions to ease the use of Biopython 
 10  """ 
 11   
 12  from __future__ import generators # requires Python 2.2 
 13   
 14  __version__ = "$Revision: 1.9 $" 
 15  # $Source: /home/repository/biopython/biopython/Bio/GFF/easy.py,v $ 
 16   
 17  import copy 
 18  import re 
 19  import string 
 20  import sys 
 21   
 22  import Bio 
 23  from Bio import GenBank 
 24  from Bio.Data import IUPACData 
 25  from Bio.Seq import Seq 
 26   
 27  from Bio import SeqIO 
 28  from Bio import SeqUtils 
 29   
 30  import GenericTools 
 31   
32 -class FeatureDict(dict):
33 """ JH: accessing feature.qualifiers as a list is stupid. Here's a dict that does it"""
34 - def __init__(self, feature_list, default=None):
35 dict.__init__(self) 36 self.default = default 37 key_re = re.compile(r'/(\S+)=') 38 39 for i in feature_list: 40 key = key_re.match(i.key).group(1) 41 val = string.replace(i.value,'"','') 42 self[key] = val
43 - def __getitem__(self, key):
44 try: 45 return dict.__getitem__(self, key) 46 except KeyError: 47 return self.default
48
49 -class Location(GenericTools.VerboseList):
50 """ 51 this is really best interfaced through LocationFromString 52 fuzzy: < or > 53 join: {0 = no join, 1 = join, 2 = order} 54 55 >>> location = Location([Location([339]), Location([564])]) # zero-based 56 >>> location 57 Location(Location(339), Location(564)) 58 >>> print location # one-based 59 340..565 60 >>> print location.five_prime() 61 340 62 >>> location_rev = Location([Location([339]), Location([564])], 1) 63 >>> print location_rev 64 complement(340..565) 65 >>> print location_rev.five_prime() 66 565 67 """
68 - def __init__(self, the_list, complement=0, seqname=None):
69 self.complement = complement 70 self.join = 0 71 self.fuzzy = None 72 self.seqname = seqname 73 list.__init__(self, the_list)
74
75 - def _joinstr(self):
76 if self.join == 1: 77 label = 'join' 78 elif self.join == 2: 79 label = 'order' 80 return "%s(%s)" % (label, ",".join(map(str, self)))
81
82 - def __str__(self):
83 if self.seqname: 84 format = "%s:%%s" % self.seqname 85 else: 86 format = "%s" 87 88 if self.complement: 89 format = format % "complement(%s)" 90 91 if self.join: 92 return format % self._joinstr() 93 94 elif isinstance(self[0], list): 95 return format % "%s..%s" % (str(self[0]), str(self[1])) 96 else: 97 if self.fuzzy: 98 format = format % self.fuzzy + "%s" 99 return format % str(self[0] + 1)
100
101 - def __repr__(self):
102 return "Location(%s)" % ", ".join(map(repr, self))
103 104 direction2index = {1: 0, -1: -1}
105 - def direction_and_index(self, direction):
106 """ 107 1: 5' 108 -1: 3' 109 110 >>> loc1 = LocationFromString("join(1..3,complement(5..6))") 111 >>> loc1.direction_and_index(1) 112 (1, 0) 113 >>> loc1.direction_and_index(-1) 114 (-1, -1) 115 >>> loc1.reverse() 116 >>> print loc1 117 complement(join(1..3,complement(5..6))) 118 >>> loc1.direction_and_index(1) 119 (-1, -1) 120 """ 121 if self.complement: 122 direction = direction * -1 123 index = self.direction2index[direction] 124 return direction, index
125
126 - def findside(self, direction):
127 """ 128 >>> loc = LocationFromString("complement(join(1..5,complement(6..10)))") 129 >>> loc.findside(1) 130 Location(5) 131 >>> loc.findside(-1) 132 Location(0) 133 """ 134 direction, index = self.direction_and_index(direction) 135 if self.join or isinstance(self[0], list): 136 return self[index].findside(direction) 137 else: 138 return self
139
140 - def findseqname_3prime(self):
141 """ 142 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))") 143 >>> loc.findseqname_3prime() 144 'MOOCOW' 145 """ 146 return self.findseqname(-1)
147
148 - def findseqname(self, direction=1): # find 5' seqname
149 """ 150 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))") 151 >>> loc.findseqname() 152 'SEQ' 153 >>> loc.findseqname(-1) 154 'MOOCOW' 155 """ 156 direction, index = self.direction_and_index(direction) 157 if self.seqname: 158 return self.seqname 159 elif self.join: 160 return self[index].findseqname(direction) 161 else: 162 raise AttributeError, 'no sequence name'
163
164 - def five_prime(self):
165 return self.findside(1)
166 - def three_prime(self):
167 return self.findside(-1)
168
169 - def length(self):
170 """ 171 WARNING: doesn't deal with joins!!!! 172 """ 173 return self.end()-self.start()
174
175 - def intersection(self, other):
176 """ 177 WARNING: doesn't deal with joins!!!! 178 179 >>> location1 = LocationFromString("1..50") 180 >>> location2 = LocationFromString("25..200") 181 >>> print location1.intersection(location2) 182 25..50 183 >>> print location1.intersection(location2) 184 25..50 185 """ 186 if self.start() >= other.start(): 187 start = self.start() 188 else: 189 start = other.start() 190 if self.end() <= other.end(): 191 end = self.end() 192 else: 193 end = other.end() 194 return Location([Location([start]), Location([end])])
195
196 - def start(self):
197 # zero-based 198 if self.complement: 199 return self.three_prime()[0] 200 else: 201 return self.five_prime()[0]
202
203 - def end(self):
204 # zero-based 205 if self.complement: 206 return self.five_prime()[0] 207 else: 208 return self.three_prime()[0]
209
210 - def three_prime_range(self, window):
211 three_prime_loc = self.three_prime() 212 if self.complement: 213 return Location([three_prime_loc-window, three_prime_loc], complement=1) 214 else: 215 return Location([three_prime_loc, three_prime_loc+window])
216
217 - def sublocation(self, sub_location):
218 """ 219 >>> fwd_location = LocationFromString('X:5830132..5831528') 220 >>> print fwd_location.sublocation(LocationFromString('1..101')) 221 X:5830132..5830232 222 >>> print fwd_location.sublocation(LocationFromString('1267..1286')) 223 X:5831398..5831417 224 >>> rev_location = LocationFromString('I:complement(8415686..8416216)') 225 >>> print rev_location.sublocation(LocationFromString('1..101')) 226 I:complement(8416116..8416216) 227 >>> print rev_location.sublocation(LocationFromString('100..200')) 228 I:complement(8416017..8416117) 229 """ 230 231 absolute_location = copy.deepcopy(self) 232 for i in xrange(2): 233 absolute_location[i] = self.five_prime().add(sub_location[i], self.complement) 234 if absolute_location.complement: 235 list.reverse(absolute_location) 236 return absolute_location
237
238 - def __add__(self, addend):
239 return self.add(addend)
240
241 - def add(self, addend, complement=0):
242 self_copy = copy.deepcopy(self) 243 if isinstance(addend, Location): 244 addend = addend[0] 245 if complement: 246 addend *= -1 247 self_copy[0] += addend 248 return self_copy
249
250 - def __sub__(self, subtrahend):
251 return self + -subtrahend
252
253 - def reverse(self):
254 self.complement = [1, 0][self.complement]
255
256 - def reorient(self):
257 """ 258 >>> loc1 = LocationFromString("join(I:complement(1..9000),I:complement(9001..10000))") 259 >>> loc1.reorient() 260 >>> print loc1 261 complement(join(I:1..9000,I:9001..10000)) 262 >>> loc2 = LocationFromString("join(I:complement(1..9000),I:9001..10000)") 263 >>> loc2.reorient() 264 >>> print loc2 265 join(I:complement(1..9000),I:9001..10000) 266 """ 267 if self.join: 268 if len([x for x in self if x.complement]) == len(self): 269 self.reverse() 270 for segment in self: 271 segment.reverse()
272
273 - def bounding(self):
274 """ 275 works for single level non-complex joins 276 277 >>> LOC = LocationFromString 278 >>> l1 = LOC("join(alpha:1..30,alpha:50..70)") 279 >>> print l1.bounding() 280 join(alpha:1..70) 281 >>> l2 = LOC("join(alpha:1..30,alpha:complement(50..70))") 282 >>> print l2.bounding() 283 join(alpha:1..30,alpha:complement(50..70)) 284 >>> l3 = LOC("join(alpha:1..30,alpha:complement(50..70),beta:6..20,alpha:25..45)") 285 >>> print l3.bounding() 286 join(alpha:1..45,alpha:complement(50..70),beta:6..20) 287 288 """ 289 if not self.join: 290 return 291 292 seqdict = {} 293 seqkeys = [] 294 for subloc in self: 295 assert subloc.seqname 296 assert not subloc.join 297 try: 298 seqdict[_hashname(subloc)].append(subloc) 299 except KeyError: 300 key = _hashname(subloc) 301 seqdict[key] = [subloc] 302 seqkeys.append(key) 303 304 res = LocationJoin() 305 for key in seqkeys: 306 locations = seqdict[key] 307 coords = [] 308 for subloc in locations: 309 coords.append(subloc.start()) 310 coords.append(subloc.end()) 311 res.append(LocationFromCoords(min(coords), max(coords), locations[0].complement, locations[0].seqname)) 312 return res
313
314 -def _hashname(location):
315 return str(location.complement) + location.seqname
316
317 -class LocationJoin(Location):
318 """ 319 >>> join = LocationJoin([LocationFromCoords(339, 564, 1), LocationFromString("complement(100..339)")]) 320 >>> appendloc = LocationFromString("order(complement(66..99),complement(5..55))") 321 >>> join.append(appendloc) 322 >>> print join 323 join(complement(340..565),complement(100..339),order(complement(66..99),complement(5..55))) 324 >>> join2 = LocationJoin() 325 >>> join2.append(LocationFromString("complement(66..99)")) 326 >>> join2.append(LocationFromString("complement(5..55)")) 327 >>> print join2 328 join(complement(66..99),complement(5..55)) 329 """
330 - def __init__(self, the_list = [], complement=0, seqname=None):
331 self.complement = complement 332 self.join = 1 333 self.fuzzy = None 334 self.seqname = seqname 335 list.__init__(self, the_list)
336
337 -class LocationFromCoords(Location):
338 """ 339 >>> print LocationFromCoords(339, 564) 340 340..565 341 >>> print LocationFromCoords(339, 564, seqname="I") 342 I:340..565 343 >>> print LocationFromCoords(999, 3234, "-", seqname="NC_343434") 344 NC_343434:complement(1000..3235) 345 """
346 - def __init__(self, start, end, strand=0, seqname=None):
347 if strand == "+": 348 strand = 0 349 elif strand == "-": 350 strand = 1 351 Location.__init__(self, [Location([start]), Location([end])], strand, seqname)
352 353 # see http://www.ncbi.nlm.nih.gov/collab/FT/index.html#backus-naur 354 # for how this should actually be implemented 355 re_complement = re.compile(r"^complement\((.*)\)$") 356 re_seqname = re.compile(r"^(?!join|order|complement)([^\:]+?):(.*)$") # not every character is allowed by spec 357 re_join = re.compile(r"^(join|order)\((.*)\)$") 358 re_dotdot = re.compile(r"^([><]*\d+)\.\.([><]*\d+)$") 359 re_fuzzy = re.compile(r"^([><])(\d+)")
360 -class LocationFromString(Location):
361 """ 362 >>> # here are some tests from http://www.ncbi.nlm.nih.gov/collab/FT/index.html#location 363 >>> print LocationFromString("467") 364 467 365 >>> print LocationFromString("340..565") 366 340..565 367 >>> print LocationFromString("<345..500") 368 <345..500 369 >>> print LocationFromString("<1..888") 370 <1..888 371 >>> # (102.110) and 123^124 syntax unimplemented 372 >>> print LocationFromString("join(12..78,134..202)") 373 join(12..78,134..202) 374 >>> print LocationFromString("complement(join(2691..4571,4918..5163))") 375 complement(join(2691..4571,4918..5163)) 376 >>> print LocationFromString("join(complement(4918..5163),complement(2691..4571))") 377 join(complement(4918..5163),complement(2691..4571)) 378 >>> print LocationFromString("order(complement(4918..5163),complement(2691..4571))") 379 order(complement(4918..5163),complement(2691..4571)) 380 >>> print LocationFromString("NC_001802x.fna:73..78") 381 NC_001802x.fna:73..78 382 >>> print LocationFromString("J00194:100..202") 383 J00194:100..202 384 385 >>> print LocationFromString("join(117505..118584,1..609)") 386 join(117505..118584,1..609) 387 >>> print LocationFromString("join(test3:complement(4..6),test3:complement(1..3))") 388 join(test3:complement(4..6),test3:complement(1..3)) 389 >>> print LocationFromString("test3:join(test1:complement(1..3),4..6)") 390 test3:join(test1:complement(1..3),4..6) 391 """
392 - def __init__(self, location_str):
393 match_seqname = re_seqname.match(location_str) 394 if match_seqname: 395 self.seqname = match_seqname.group(1) 396 location_str = match_seqname.group(2) 397 else: 398 self.seqname = None 399 match_complement = re_complement.match(location_str) 400 if match_complement: 401 self.complement = 1 402 location_str = match_complement.group(1) 403 else: 404 self.complement = 0 405 match_join = re_join.match(location_str) 406 if match_join: 407 self.join = {'join':1, 'order':2}[match_join.group(1)] 408 list.__init__(self, map(lambda x: LocationFromString(x), match_join.group(2).split(","))) 409 else: 410 self.join = 0 411 match_dotdot = re_dotdot.match(location_str) 412 if match_dotdot: 413 list.__init__(self, map(lambda x: LocationFromString(match_dotdot.group(x)), (1, 2))) 414 else: 415 match_fuzzy = re_fuzzy.match(location_str) 416 if match_fuzzy: 417 self.fuzzy = match_fuzzy.group(1) 418 location_str = match_fuzzy.group(2) 419 else: 420 self.fuzzy = None 421 422 list.__init__(self, [int(location_str)-1]) # zero based, nip it in the bud
423
424 -def open_file(filename):
425 if filename: 426 return open(filename) 427 else: 428 return sys.stdin
429
430 -def fasta_single(filename=None, string=None):
431 """ 432 >>> record = fasta_single(string=\""" 433 ... >gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1] 434 ... MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAVNPGLLETSEGCRQILGQLQPSLQT 435 ... GSEELRSLYNTVATLYCVHQRIEIKDTKEALDKIEEEQNKSKKKAQQAAADTGHSNQVSQNYPIVQNIQG 436 ... QMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAA 437 ... EWDRVHPVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTNNPPIPVGEIYKRWIILGLNKIVRMYSPT 438 ... SILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPAATLEEMMTAC 439 ... QGVGGPGHKARVLAEAMSQVTNSATIMMQRGNFRNQRKIVKCFNCGKEGHTARNCRAPRKKGCWKCGKEG 440 ... HQMKDCTERQANFLGKIWPSYKGRPGNFLQSRPEPTAPPEESFRSGVETTTPPQKQEPIDKELYPLTSLR 441 ... SLFGNDPSSQ 442 ... \""") 443 >>> record.id 444 'gi|9629360|ref|NP_057850.1|' 445 >>> record.description 446 'gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1]' 447 >>> record.seq[0:5] 448 Seq('MGARA', SingleLetterAlphabet()) 449 """ 450 #Returns the first record in a fasta file as a SeqRecord, 451 #or None if there are no records in the file. 452 if string: 453 import cStringIO 454 handle = cStringIO.StringIO(string) 455 else : 456 handle = open_file(filename) 457 try : 458 record = SeqIO.parse(handle, format="fasta").next() 459 except StopIteration : 460 record = None 461 return record
462
463 -def fasta_multi(filename=None):
464 #Simple way is just: 465 #return SeqIO.parse(open_file(filename), format="fasta") 466 #However, for backwards compatibility make sure we raise 467 #the StopIteration exception rather than returning None. 468 reader = SeqIO.parse(open_file(filename), format="fasta") 469 while True: 470 record = reader.next() 471 if record is None: 472 raise StopIteration 473 else: 474 yield record
475
476 -def fasta_readrecords(filename=None):
477 """ 478 >>> records = fasta_readrecords('GFF/multi.fna') 479 >>> records[0].id 480 'test1' 481 >>> records[2].seq 482 Seq('AAACACAC', SingleLetterAlphabet()) 483 """ 484 return list(SeqIO.parse(open_file(filename), format="fasta"))
485
486 -def fasta_write(filename, records):
487 handle = open(filename, "w") 488 SeqIO.write(records, handle, format="fasta") 489 handle.close()
490
491 -def genbank_single(filename):
492 """ 493 >>> record = genbank_single("GFF/NC_001422.gbk") 494 >>> record.taxonomy 495 ['Viruses', 'ssDNA viruses', 'Microviridae', 'Microvirus'] 496 >>> cds = record.features[-4] 497 >>> cds.key 498 'CDS' 499 >>> location = LocationFromString(cds.location) 500 >>> print location 501 2931..3917 502 >>> subseq = record_subseq(record, location) 503 >>> subseq[0:20] 504 Seq('ATGTTTGGTGCTATTGCTGG', Alphabet()) 505 """ 506 return GenBank.RecordParser().parse(open(filename))
507
508 -def record_subseq(record, location, *args, **keywds):
509 """ 510 >>> from Bio.SeqRecord import SeqRecord 511 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"), 512 ... "ref|NC_001422", 513 ... "Coliphage phiX174, complete genome", 514 ... "bases 1-11") 515 >>> record_subseq(record, LocationFromString("1..4")) # one-based 516 Seq('GAGT', Alphabet()) 517 >>> record_subseq(record, LocationFromString("complement(1..4)")) # one-based 518 Seq('ACTC', Alphabet()) 519 >>> record_subseq(record, LocationFromString("join(complement(1..4),1..4)")) # what an idea! 520 Seq('ACTCGAGT', Alphabet()) 521 >>> loc = LocationFromString("complement(join(complement(5..7),1..4))") 522 >>> print loc 523 complement(join(complement(5..7),1..4)) 524 >>> record_subseq(record, loc) 525 Seq('ACTCTTT', Alphabet()) 526 >>> print loc 527 complement(join(complement(5..7),1..4)) 528 >>> loc.reverse() 529 >>> record_subseq(record, loc) 530 Seq('AAAGAGT', Alphabet()) 531 >>> record_subseq(record, loc, upper=1) 532 Seq('AAAGAGT', Alphabet()) 533 """ 534 if location.join: 535 subsequence_list = [] 536 if location.complement: 537 location_copy = copy.copy(location) 538 list.reverse(location_copy) 539 else: 540 location_copy = location 541 for sublocation in location_copy: 542 if location.complement: 543 sublocation_copy = copy.copy(sublocation) 544 sublocation_copy.reverse() 545 else: 546 sublocation_copy = sublocation 547 subsequence_list.append(record_subseq(record, sublocation_copy, *args, **keywds).tostring()) 548 return Seq(''.join(subsequence_list), record_sequence(record).alphabet) 549 else: 550 return record_coords(record, location.start(), location.end()+1, location.complement, *args, **keywds)
551
552 -def record_sequence(record):
553 """ 554 returns the sequence of a record 555 556 can be Bio.SeqRecord.SeqRecord or Bio.GenBank.Record.Record 557 """ 558 if isinstance(record, Bio.SeqRecord.SeqRecord): 559 return record.seq 560 elif isinstance(record, Bio.GenBank.Record.Record): 561 return Seq(record.sequence) 562 else: 563 raise TypeError, 'not Bio.SeqRecord.SeqRecord or Bio.GenBank.Record.Record'
564
565 -def record_coords(record, start, end, strand=0, upper=0):
566 """ 567 >>> from Bio.SeqRecord import SeqRecord 568 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"), 569 ... "ref|NC_001422", 570 ... "Coliphage phiX174, complete genome", 571 ... "bases 1-11") 572 >>> record_coords(record, 0, 4) # zero-based 573 Seq('GAGT', Alphabet()) 574 >>> record_coords(record, 0, 4, "-") # zero-based 575 Seq('ACTC', Alphabet()) 576 >>> record_coords(record, 0, 4, "-", upper=1) # zero-based 577 Seq('ACTC', Alphabet()) 578 """ 579 580 subseq = record_sequence(record)[start:end] 581 subseq_str = subseq.tostring() 582 subseq_str = subseq_str.upper() 583 subseq = Seq(subseq_str, subseq.alphabet) 584 if strand == '-' or strand == 1: 585 return subseq.reverse_complement() 586 else: 587 return subseq
588
589 -def _test(*args, **keywds):
590 import doctest, sys 591 doctest.testmod(sys.modules[__name__], *args, **keywds)
592 593 if __name__ == "__main__": 594 if __debug__: 595 _test() 596