Package Bio :: Package GFF :: Module easy
[hide private]
[frames] | no frames]

Source Code for Module Bio.GFF.easy

  1  #!/usr/bin/env python 
  2  # 
  3  # Copyright 2002 by Michael Hoffman.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """ 
  9  Bio.GFF.easy: some functions to ease the use of Biopython 
 10  """ 
 11   
 12  __version__ = "$Revision: 1.12 $" 
 13  # $Source: /home/repository/biopython/biopython/Bio/GFF/easy.py,v $ 
 14   
 15  import copy 
 16  import re 
 17  import sys 
 18   
 19  import Bio 
 20  from Bio import GenBank 
 21  from Bio.Data import IUPACData 
 22  from Bio.Seq import Seq 
 23   
 24  from Bio import SeqIO 
 25  from Bio import SeqUtils 
 26   
 27  import GenericTools 
 28   
29 -class FeatureDict(dict):
30 """ JH: accessing feature.qualifiers as a list is stupid. Here's a dict that does it"""
31 - def __init__(self, feature_list, default=None):
32 dict.__init__(self) 33 self.default = default 34 key_re = re.compile(r'/(\S+)=') 35 36 for i in feature_list: 37 key = key_re.match(i.key).group(1) 38 val = i.value.replace('"','') 39 self[key] = val
40 - def __getitem__(self, key):
41 try: 42 return dict.__getitem__(self, key) 43 except KeyError: 44 return self.default
45
46 -class Location(GenericTools.VerboseList):
47 """ 48 this is really best interfaced through LocationFromString 49 fuzzy: < or > 50 join: {0 = no join, 1 = join, 2 = order} 51 52 >>> location = Location([Location([339]), Location([564])]) # zero-based 53 >>> location 54 Location(Location(339), Location(564)) 55 >>> print location # one-based 56 340..565 57 >>> print location.five_prime() 58 340 59 >>> location_rev = Location([Location([339]), Location([564])], 1) 60 >>> print location_rev 61 complement(340..565) 62 >>> print location_rev.five_prime() 63 565 64 """
65 - def __init__(self, the_list, complement=0, seqname=None):
66 self.complement = complement 67 self.join = 0 68 self.fuzzy = None 69 self.seqname = seqname 70 list.__init__(self, the_list)
71
72 - def _joinstr(self):
73 if self.join == 1: 74 label = 'join' 75 elif self.join == 2: 76 label = 'order' 77 return "%s(%s)" % (label, ",".join(map(str, self)))
78
79 - def __str__(self):
80 if self.seqname: 81 format = "%s:%%s" % self.seqname 82 else: 83 format = "%s" 84 85 if self.complement: 86 format = format % "complement(%s)" 87 88 if self.join: 89 return format % self._joinstr() 90 91 elif isinstance(self[0], list): 92 return format % "%s..%s" % (str(self[0]), str(self[1])) 93 else: 94 if self.fuzzy: 95 format = format % self.fuzzy + "%s" 96 return format % str(self[0] + 1)
97
98 - def __repr__(self):
99 return "Location(%s)" % ", ".join(map(repr, self))
100 101 direction2index = {1: 0, -1: -1}
102 - def direction_and_index(self, direction):
103 """ 104 1: 5' 105 -1: 3' 106 107 >>> loc1 = LocationFromString("join(1..3,complement(5..6))") 108 >>> loc1.direction_and_index(1) 109 (1, 0) 110 >>> loc1.direction_and_index(-1) 111 (-1, -1) 112 >>> loc1.reverse() 113 >>> print loc1 114 complement(join(1..3,complement(5..6))) 115 >>> loc1.direction_and_index(1) 116 (-1, -1) 117 """ 118 if self.complement: 119 direction = direction * -1 120 index = self.direction2index[direction] 121 return direction, index
122
123 - def findside(self, direction):
124 """ 125 >>> loc = LocationFromString("complement(join(1..5,complement(6..10)))") 126 >>> loc.findside(1) 127 Location(5) 128 >>> loc.findside(-1) 129 Location(0) 130 """ 131 direction, index = self.direction_and_index(direction) 132 if self.join or isinstance(self[0], list): 133 return self[index].findside(direction) 134 else: 135 return self
136
137 - def findseqname_3prime(self):
138 """ 139 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))") 140 >>> loc.findseqname_3prime() 141 'MOOCOW' 142 """ 143 return self.findseqname(-1)
144
145 - def findseqname(self, direction=1): # find 5' seqname
146 """ 147 >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))") 148 >>> loc.findseqname() 149 'SEQ' 150 >>> loc.findseqname(-1) 151 'MOOCOW' 152 """ 153 direction, index = self.direction_and_index(direction) 154 if self.seqname: 155 return self.seqname 156 elif self.join: 157 return self[index].findseqname(direction) 158 else: 159 raise AttributeError('no sequence name')
160
161 - def five_prime(self):
162 return self.findside(1)
163 - def three_prime(self):
164 return self.findside(-1)
165
166 - def length(self):
167 """ 168 WARNING: doesn't deal with joins!!!! 169 """ 170 return self.end()-self.start()
171
172 - def intersection(self, other):
173 """ 174 WARNING: doesn't deal with joins!!!! 175 176 >>> location1 = LocationFromString("1..50") 177 >>> location2 = LocationFromString("25..200") 178 >>> print location1.intersection(location2) 179 25..50 180 >>> print location1.intersection(location2) 181 25..50 182 """ 183 if self.start() >= other.start(): 184 start = self.start() 185 else: 186 start = other.start() 187 if self.end() <= other.end(): 188 end = self.end() 189 else: 190 end = other.end() 191 return Location([Location([start]), Location([end])])
192
193 - def start(self):
194 # zero-based 195 if self.complement: 196 return self.three_prime()[0] 197 else: 198 return self.five_prime()[0]
199
200 - def end(self):
201 # zero-based 202 if self.complement: 203 return self.five_prime()[0] 204 else: 205 return self.three_prime()[0]
206
207 - def three_prime_range(self, window):
208 three_prime_loc = self.three_prime() 209 if self.complement: 210 return Location([three_prime_loc-window, three_prime_loc], complement=1) 211 else: 212 return Location([three_prime_loc, three_prime_loc+window])
213
214 - def sublocation(self, sub_location):
215 """ 216 >>> fwd_location = LocationFromString('X:5830132..5831528') 217 >>> print fwd_location.sublocation(LocationFromString('1..101')) 218 X:5830132..5830232 219 >>> print fwd_location.sublocation(LocationFromString('1267..1286')) 220 X:5831398..5831417 221 >>> rev_location = LocationFromString('I:complement(8415686..8416216)') 222 >>> print rev_location.sublocation(LocationFromString('1..101')) 223 I:complement(8416116..8416216) 224 >>> print rev_location.sublocation(LocationFromString('100..200')) 225 I:complement(8416017..8416117) 226 """ 227 228 absolute_location = copy.deepcopy(self) 229 for i in xrange(2): 230 absolute_location[i] = self.five_prime().add(sub_location[i], self.complement) 231 if absolute_location.complement: 232 list.reverse(absolute_location) 233 return absolute_location
234
235 - def __add__(self, addend):
236 return self.add(addend)
237
238 - def add(self, addend, complement=0):
239 self_copy = copy.deepcopy(self) 240 if isinstance(addend, Location): 241 addend = addend[0] 242 if complement: 243 addend *= -1 244 self_copy[0] += addend 245 return self_copy
246
247 - def __sub__(self, subtrahend):
248 return self + -subtrahend
249
250 - def reverse(self):
251 self.complement = [1, 0][self.complement]
252
253 - def reorient(self):
254 """ 255 >>> loc1 = LocationFromString("join(I:complement(1..9000),I:complement(9001..10000))") 256 >>> loc1.reorient() 257 >>> print loc1 258 complement(join(I:1..9000,I:9001..10000)) 259 >>> loc2 = LocationFromString("join(I:complement(1..9000),I:9001..10000)") 260 >>> loc2.reorient() 261 >>> print loc2 262 join(I:complement(1..9000),I:9001..10000) 263 """ 264 if self.join: 265 if len([x for x in self if x.complement]) == len(self): 266 self.reverse() 267 for segment in self: 268 segment.reverse()
269
270 - def bounding(self):
271 """ 272 works for single level non-complex joins 273 274 >>> LOC = LocationFromString 275 >>> l1 = LOC("join(alpha:1..30,alpha:50..70)") 276 >>> print l1.bounding() 277 join(alpha:1..70) 278 >>> l2 = LOC("join(alpha:1..30,alpha:complement(50..70))") 279 >>> print l2.bounding() 280 join(alpha:1..30,alpha:complement(50..70)) 281 >>> l3 = LOC("join(alpha:1..30,alpha:complement(50..70),beta:6..20,alpha:25..45)") 282 >>> print l3.bounding() 283 join(alpha:1..45,alpha:complement(50..70),beta:6..20) 284 285 """ 286 if not self.join: 287 return 288 289 seqdict = {} 290 seqkeys = [] 291 for subloc in self: 292 assert subloc.seqname 293 assert not subloc.join 294 try: 295 seqdict[_hashname(subloc)].append(subloc) 296 except KeyError: 297 key = _hashname(subloc) 298 seqdict[key] = [subloc] 299 seqkeys.append(key) 300 301 res = LocationJoin() 302 for key in seqkeys: 303 locations = seqdict[key] 304 coords = [] 305 for subloc in locations: 306 coords.append(subloc.start()) 307 coords.append(subloc.end()) 308 res.append(LocationFromCoords(min(coords), max(coords), locations[0].complement, locations[0].seqname)) 309 return res
310
311 -def _hashname(location):
312 return str(location.complement) + location.seqname
313
314 -class LocationJoin(Location):
315 """ 316 >>> join = LocationJoin([LocationFromCoords(339, 564, 1), LocationFromString("complement(100..339)")]) 317 >>> appendloc = LocationFromString("order(complement(66..99),complement(5..55))") 318 >>> join.append(appendloc) 319 >>> print join 320 join(complement(340..565),complement(100..339),order(complement(66..99),complement(5..55))) 321 >>> join2 = LocationJoin() 322 >>> join2.append(LocationFromString("complement(66..99)")) 323 >>> join2.append(LocationFromString("complement(5..55)")) 324 >>> print join2 325 join(complement(66..99),complement(5..55)) 326 """
327 - def __init__(self, the_list = [], complement=0, seqname=None):
328 self.complement = complement 329 self.join = 1 330 self.fuzzy = None 331 self.seqname = seqname 332 list.__init__(self, the_list)
333
334 -class LocationFromCoords(Location):
335 """ 336 >>> print LocationFromCoords(339, 564) 337 340..565 338 >>> print LocationFromCoords(339, 564, seqname="I") 339 I:340..565 340 >>> print LocationFromCoords(999, 3234, "-", seqname="NC_343434") 341 NC_343434:complement(1000..3235) 342 """
343 - def __init__(self, start, end, strand=0, seqname=None):
344 if strand == "+": 345 strand = 0 346 elif strand == "-": 347 strand = 1 348 Location.__init__(self, [Location([start]), Location([end])], strand, seqname)
349 350 # see http://www.ncbi.nlm.nih.gov/collab/FT/index.html#backus-naur 351 # for how this should actually be implemented 352 re_complement = re.compile(r"^complement\((.*)\)$") 353 re_seqname = re.compile(r"^(?!join|order|complement)([^\:]+?):(.*)$") # not every character is allowed by spec 354 re_join = re.compile(r"^(join|order)\((.*)\)$") 355 re_dotdot = re.compile(r"^([><]*\d+)\.\.([><]*\d+)$") 356 re_fuzzy = re.compile(r"^([><])(\d+)")
357 -class LocationFromString(Location):
358 """ 359 >>> # here are some tests from http://www.ncbi.nlm.nih.gov/collab/FT/index.html#location 360 >>> print LocationFromString("467") 361 467 362 >>> print LocationFromString("340..565") 363 340..565 364 >>> print LocationFromString("<345..500") 365 <345..500 366 >>> print LocationFromString("<1..888") 367 <1..888 368 >>> # (102.110) and 123^124 syntax unimplemented 369 >>> print LocationFromString("join(12..78,134..202)") 370 join(12..78,134..202) 371 >>> print LocationFromString("complement(join(2691..4571,4918..5163))") 372 complement(join(2691..4571,4918..5163)) 373 >>> print LocationFromString("join(complement(4918..5163),complement(2691..4571))") 374 join(complement(4918..5163),complement(2691..4571)) 375 >>> print LocationFromString("order(complement(4918..5163),complement(2691..4571))") 376 order(complement(4918..5163),complement(2691..4571)) 377 >>> print LocationFromString("NC_001802x.fna:73..78") 378 NC_001802x.fna:73..78 379 >>> print LocationFromString("J00194:100..202") 380 J00194:100..202 381 382 >>> print LocationFromString("join(117505..118584,1..609)") 383 join(117505..118584,1..609) 384 >>> print LocationFromString("join(test3:complement(4..6),test3:complement(1..3))") 385 join(test3:complement(4..6),test3:complement(1..3)) 386 >>> print LocationFromString("test3:join(test1:complement(1..3),4..6)") 387 test3:join(test1:complement(1..3),4..6) 388 """
389 - def __init__(self, location_str):
390 match_seqname = re_seqname.match(location_str) 391 if match_seqname: 392 self.seqname = match_seqname.group(1) 393 location_str = match_seqname.group(2) 394 else: 395 self.seqname = None 396 match_complement = re_complement.match(location_str) 397 if match_complement: 398 self.complement = 1 399 location_str = match_complement.group(1) 400 else: 401 self.complement = 0 402 match_join = re_join.match(location_str) 403 if match_join: 404 self.join = {'join':1, 'order':2}[match_join.group(1)] 405 list.__init__(self, map(lambda x: LocationFromString(x), match_join.group(2).split(","))) 406 else: 407 self.join = 0 408 match_dotdot = re_dotdot.match(location_str) 409 if match_dotdot: 410 list.__init__(self, map(lambda x: LocationFromString(match_dotdot.group(x)), (1, 2))) 411 else: 412 match_fuzzy = re_fuzzy.match(location_str) 413 if match_fuzzy: 414 self.fuzzy = match_fuzzy.group(1) 415 location_str = match_fuzzy.group(2) 416 else: 417 self.fuzzy = None 418 419 list.__init__(self, [int(location_str)-1]) # zero based, nip it in the bud
420
421 -def open_file(filename):
422 if filename: 423 return open(filename) 424 else: 425 return sys.stdin
426
427 -def fasta_single(filename=None, string=None):
428 """ 429 >>> record = fasta_single(string=\""" 430 ... >gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1] 431 ... MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAVNPGLLETSEGCRQILGQLQPSLQT 432 ... GSEELRSLYNTVATLYCVHQRIEIKDTKEALDKIEEEQNKSKKKAQQAAADTGHSNQVSQNYPIVQNIQG 433 ... QMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAA 434 ... EWDRVHPVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTNNPPIPVGEIYKRWIILGLNKIVRMYSPT 435 ... SILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPAATLEEMMTAC 436 ... QGVGGPGHKARVLAEAMSQVTNSATIMMQRGNFRNQRKIVKCFNCGKEGHTARNCRAPRKKGCWKCGKEG 437 ... HQMKDCTERQANFLGKIWPSYKGRPGNFLQSRPEPTAPPEESFRSGVETTTPPQKQEPIDKELYPLTSLR 438 ... SLFGNDPSSQ 439 ... \""") 440 >>> record.id 441 'gi|9629360|ref|NP_057850.1|' 442 >>> record.description 443 'gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1]' 444 >>> record.seq[0:5] 445 Seq('MGARA', SingleLetterAlphabet()) 446 """ 447 #Returns the first record in a fasta file as a SeqRecord, 448 #or None if there are no records in the file. 449 if string: 450 import cStringIO 451 handle = cStringIO.StringIO(string) 452 else : 453 handle = open_file(filename) 454 try : 455 record = SeqIO.parse(handle, format="fasta").next() 456 except StopIteration : 457 record = None 458 return record
459
460 -def fasta_multi(filename=None):
461 #Simple way is just: 462 #return SeqIO.parse(open_file(filename), format="fasta") 463 #However, for backwards compatibility make sure we raise 464 #the StopIteration exception rather than returning None. 465 reader = SeqIO.parse(open_file(filename), format="fasta") 466 while True: 467 record = reader.next() 468 if record is None: 469 raise StopIteration 470 else: 471 yield record
472
473 -def fasta_readrecords(filename=None):
474 """ 475 >>> records = fasta_readrecords('GFF/multi.fna') 476 >>> records[0].id 477 'test1' 478 >>> records[2].seq 479 Seq('AAACACAC', SingleLetterAlphabet()) 480 """ 481 return list(SeqIO.parse(open_file(filename), format="fasta"))
482
483 -def fasta_write(filename, records):
484 handle = open(filename, "w") 485 SeqIO.write(records, handle, format="fasta") 486 handle.close()
487
488 -def genbank_single(filename):
489 """ 490 >>> record = genbank_single("GFF/NC_001422.gbk") 491 >>> record.taxonomy 492 ['Viruses', 'ssDNA viruses', 'Microviridae', 'Microvirus'] 493 >>> cds = record.features[-4] 494 >>> cds.key 495 'CDS' 496 >>> location = LocationFromString(cds.location) 497 >>> print location 498 2931..3917 499 >>> subseq = record_subseq(record, location) 500 >>> subseq[0:20] 501 Seq('ATGTTTGGTGCTATTGCTGG', Alphabet()) 502 """ 503 return GenBank.RecordParser().parse(open(filename))
504
505 -def record_subseq(record, location, *args, **keywds):
506 """ 507 >>> from Bio.SeqRecord import SeqRecord 508 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"), 509 ... "ref|NC_001422", 510 ... "Coliphage phiX174, complete genome", 511 ... "bases 1-11") 512 >>> record_subseq(record, LocationFromString("1..4")) # one-based 513 Seq('GAGT', Alphabet()) 514 >>> record_subseq(record, LocationFromString("complement(1..4)")) # one-based 515 Seq('ACTC', Alphabet()) 516 >>> record_subseq(record, LocationFromString("join(complement(1..4),1..4)")) # what an idea! 517 Seq('ACTCGAGT', Alphabet()) 518 >>> loc = LocationFromString("complement(join(complement(5..7),1..4))") 519 >>> print loc 520 complement(join(complement(5..7),1..4)) 521 >>> record_subseq(record, loc) 522 Seq('ACTCTTT', Alphabet()) 523 >>> print loc 524 complement(join(complement(5..7),1..4)) 525 >>> loc.reverse() 526 >>> record_subseq(record, loc) 527 Seq('AAAGAGT', Alphabet()) 528 >>> record_subseq(record, loc, upper=1) 529 Seq('AAAGAGT', Alphabet()) 530 """ 531 if location.join: 532 subsequence_list = [] 533 if location.complement: 534 location_copy = copy.copy(location) 535 list.reverse(location_copy) 536 else: 537 location_copy = location 538 for sublocation in location_copy: 539 if location.complement: 540 sublocation_copy = copy.copy(sublocation) 541 sublocation_copy.reverse() 542 else: 543 sublocation_copy = sublocation 544 subsequence_list.append(record_subseq(record, sublocation_copy, *args, **keywds).tostring()) 545 return Seq(''.join(subsequence_list), record_sequence(record).alphabet) 546 else: 547 return record_coords(record, location.start(), location.end()+1, location.complement, *args, **keywds)
548
549 -def record_sequence(record):
550 """ 551 returns the sequence of a record 552 553 can be Bio.SeqRecord.SeqRecord or Bio.GenBank.Record.Record 554 """ 555 if isinstance(record, Bio.SeqRecord.SeqRecord): 556 return record.seq 557 elif isinstance(record, Bio.GenBank.Record.Record): 558 return Seq(record.sequence) 559 else: 560 raise TypeError('not Bio.SeqRecord.SeqRecord or Bio.GenBank.Record.Record')
561
562 -def record_coords(record, start, end, strand=0, upper=0):
563 """ 564 >>> from Bio.SeqRecord import SeqRecord 565 >>> record = SeqRecord(Seq("gagttttatcgcttccatga"), 566 ... "ref|NC_001422", 567 ... "Coliphage phiX174, complete genome", 568 ... "bases 1-11") 569 >>> record_coords(record, 0, 4) # zero-based 570 Seq('GAGT', Alphabet()) 571 >>> record_coords(record, 0, 4, "-") # zero-based 572 Seq('ACTC', Alphabet()) 573 >>> record_coords(record, 0, 4, "-", upper=1) # zero-based 574 Seq('ACTC', Alphabet()) 575 """ 576 577 subseq = record_sequence(record)[start:end] 578 subseq_str = subseq.tostring() 579 subseq_str = subseq_str.upper() 580 subseq = Seq(subseq_str, subseq.alphabet) 581 if strand == '-' or strand == 1: 582 return subseq.reverse_complement() 583 else: 584 return subseq
585
586 -def _test(*args, **keywds):
587 import doctest, sys 588 doctest.testmod(sys.modules[__name__], *args, **keywds)
589 590 if __name__ == "__main__": 591 if __debug__: 592 _test() 593