Package Bio :: Package SwissProt :: Module SProt
[hide private]
[frames] | no frames]

Source Code for Module Bio.SwissProt.SProt

   1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """ 
   7  This module provides code to work with the sprotXX.dat file from 
   8  Utilities for working with FASTA-formatted sequences (OBSOLETE). 
   9  http://www.expasy.ch/sprot/sprot-top.html 
  10   
  11  Please see Bio.SwissProt for alternatives for the functionality in this module. 
  12   
  13  Tested with: 
  14  Release 37, Release 38, Release 39 
  15   
  16  Limited testing with: 
  17  Release 51, 54 
  18   
  19   
  20  Classes: 
  21  Record             Holds SwissProt data. 
  22  Reference          Holds reference data from a SwissProt entry. 
  23  Iterator           Iterates over entries in a SwissProt file. 
  24  Dictionary         Accesses a SwissProt file using a dictionary interface. 
  25  RecordParser       Parses a SwissProt record into a Record object. 
  26  SequenceParser     Parses a SwissProt record into a SeqRecord object. 
  27   
  28  _Scanner           Scans SwissProt-formatted data. 
  29  _RecordConsumer    Consumes SwissProt data to a SProt.Record object. 
  30  _SequenceConsumer  Consumes SwissProt data to a SeqRecord object. 
  31   
  32   
  33  Functions: 
  34  index_file         Index a SwissProt file for a Dictionary. 
  35   
  36  """ 
  37  from types import * 
  38  import os 
  39  from Bio import File 
  40  from Bio import Index 
  41  from Bio import Alphabet 
  42  from Bio import Seq 
  43  from Bio import SeqRecord 
  44  from Bio.ParserSupport import * 
  45   
  46  # The parse(), read() functions can probably be simplified if we don't 
  47  # use the "parser = RecordParser(); parser.parse(handle)" approach. 
48 -def parse(handle):
49 from SProt import RecordParser 50 import cStringIO 51 parser = RecordParser() 52 text = "" 53 for line in handle: 54 text += line 55 if line[:2]=='//': 56 handle = cStringIO.StringIO(text) 57 record = parser.parse(handle) 58 text = "" 59 yield record
60
61 -def read(handle):
62 from SProt import RecordParser 63 parser = RecordParser() 64 try: 65 record = parser.parse(handle) 66 except ValueError, error: 67 if error.message.startswith("Line does not start with 'ID':"): 68 raise ValueError("No SwissProt record found") 69 else: 70 raise error 71 # We should have reached the end of the record by now 72 remainder = handle.read() 73 if remainder: 74 raise ValueError("More than one SwissProt record found") 75 return record
76 77 78 _CHOMP = " \n\r\t.,;" #whitespace and trailing punctuation 79
80 -class Record:
81 """Holds information from a SwissProt record. 82 83 Members: 84 entry_name Name of this entry, e.g. RL1_ECOLI. 85 data_class Either 'STANDARD' or 'PRELIMINARY'. 86 molecule_type Type of molecule, 'PRT', 87 sequence_length Number of residues. 88 89 accessions List of the accession numbers, e.g. ['P00321'] 90 created A tuple of (date, release). 91 sequence_update A tuple of (date, release). 92 annotation_update A tuple of (date, release). 93 94 description Free-format description. 95 gene_name Gene name. See userman.txt for description. 96 organism The source of the sequence. 97 organelle The origin of the sequence. 98 organism_classification The taxonomy classification. List of strings. 99 (http://www.ncbi.nlm.nih.gov/Taxonomy/) 100 taxonomy_id A list of NCBI taxonomy id's. 101 host_organism A list of NCBI taxonomy id's of the hosts of a virus, 102 if any. 103 references List of Reference objects. 104 comments List of strings. 105 cross_references List of tuples (db, id1[, id2][, id3]). See the docs. 106 keywords List of the keywords. 107 features List of tuples (key name, from, to, description). 108 from and to can be either integers for the residue 109 numbers, '<', '>', or '?' 110 111 seqinfo tuple of (length, molecular weight, CRC32 value) 112 sequence The sequence. 113 114 """
115 - def __init__(self):
116 self.entry_name = None 117 self.data_class = None 118 self.molecule_type = None 119 self.sequence_length = None 120 121 self.accessions = [] 122 self.created = None 123 self.sequence_update = None 124 self.annotation_update = None 125 126 self.description = '' 127 self.gene_name = '' 128 self.organism = '' 129 self.organelle = '' 130 self.organism_classification = [] 131 self.taxonomy_id = [] 132 self.host_organism = [] 133 self.references = [] 134 self.comments = [] 135 self.cross_references = [] 136 self.keywords = [] 137 self.features = [] 138 139 self.seqinfo = None 140 self.sequence = ''
141
142 -class Reference:
143 """Holds information from 1 references in a SwissProt entry. 144 145 Members: 146 number Number of reference in an entry. 147 positions Describes extent of work. list of strings. 148 comments Comments. List of (token, text). 149 references References. List of (dbname, identifier) 150 authors The authors of the work. 151 title Title of the work. 152 location A citation for the work. 153 154 """
155 - def __init__(self):
156 self.number = None 157 self.positions = [] 158 self.comments = [] 159 self.references = [] 160 self.authors = '' 161 self.title = '' 162 self.location = ''
163
164 -class Iterator:
165 """Returns one record at a time from a SwissProt file. 166 167 Methods: 168 next Return the next record from the stream, or None. 169 170 """
171 - def __init__(self, handle, parser=None):
172 """__init__(self, handle, parser=None) 173 174 Create a new iterator. handle is a file-like object. parser 175 is an optional Parser object to change the results into another form. 176 If set to None, then the raw contents of the file will be returned. 177 178 """ 179 import warnings 180 warnings.warn("Bio.SwissProt.SProt.Iterator is deprecated. Please use the function Bio.SwissProt.parse instead if you want to get a SwissProt.SProt.Record, or Bio.SeqIO.parse if you want to get a SeqRecord. If these solutions do not work for you, please get in contact with the Biopython developers (biopython-dev@biopython.org).", 181 DeprecationWarning) 182 183 if type(handle) is not FileType and type(handle) is not InstanceType: 184 raise ValueError("I expected a file handle or file-like object") 185 self._uhandle = File.UndoHandle(handle) 186 self._parser = parser
187
188 - def next(self):
189 """next(self) -> object 190 191 Return the next swissprot record from the file. If no more records, 192 return None. 193 194 """ 195 lines = [] 196 while 1: 197 line = self._uhandle.readline() 198 if not line: 199 break 200 lines.append(line) 201 if line[:2] == '//': 202 break 203 204 if not lines: 205 return None 206 207 data = ''.join(lines) 208 if self._parser is not None: 209 return self._parser.parse(File.StringHandle(data)) 210 return data
211
212 - def __iter__(self):
213 return iter(self.next, None)
214
215 -class Dictionary:
216 """Accesses a SwissProt file using a dictionary interface. 217 218 """ 219 __filename_key = '__filename' 220
221 - def __init__(self, indexname, parser=None):
222 """__init__(self, indexname, parser=None) 223 224 Open a SwissProt Dictionary. indexname is the name of the 225 index for the dictionary. The index should have been created 226 using the index_file function. parser is an optional Parser 227 object to change the results into another form. If set to None, 228 then the raw contents of the file will be returned. 229 230 """ 231 self._index = Index.Index(indexname) 232 self._handle = open(self._index[self.__filename_key]) 233 self._parser = parser
234
235 - def __len__(self):
236 return len(self._index)
237
238 - def __getitem__(self, key):
239 start, len = self._index[key] 240 self._handle.seek(start) 241 data = self._handle.read(len) 242 if self._parser is not None: 243 return self._parser.parse(File.StringHandle(data)) 244 return data
245
246 - def __getattr__(self, name):
247 return getattr(self._index, name)
248
249 - def keys(self):
250 # I only want to expose the keys for SwissProt. 251 k = self._index.keys() 252 k.remove(self.__filename_key) 253 return k
254 255
256 -class RecordParser(AbstractParser):
257 """Parses SwissProt data into a Record object. 258 259 """
260 - def __init__(self):
261 self._scanner = _Scanner() 262 self._consumer = _RecordConsumer()
263
264 - def parse(self, handle):
265 self._scanner.feed(handle, self._consumer) 266 return self._consumer.data
267
268 -class SequenceParser(AbstractParser):
269 """Parses SwissProt data into a standard SeqRecord object. 270 """
271 - def __init__(self, alphabet = Alphabet.generic_protein):
272 """Initialize a SequenceParser. 273 274 Arguments: 275 o alphabet - The alphabet to use for the generated Seq objects. If 276 not supplied this will default to the generic protein alphabet. 277 """ 278 self._scanner = _Scanner() 279 self._consumer = _SequenceConsumer(alphabet)
280
281 - def parse(self, handle):
282 self._scanner.feed(handle, self._consumer) 283 return self._consumer.data
284
285 -class _Scanner:
286 """Scans SwissProt-formatted data. 287 288 Tested with: 289 Release 37 290 Release 38 291 """ 292
293 - def feed(self, handle, consumer):
294 """feed(self, handle, consumer) 295 296 Feed in SwissProt data for scanning. handle is a file-like 297 object that contains swissprot data. consumer is a 298 Consumer object that will receive events as the report is scanned. 299 300 """ 301 if isinstance(handle, File.UndoHandle): 302 uhandle = handle 303 else: 304 uhandle = File.UndoHandle(handle) 305 self._scan_record(uhandle, consumer)
306
307 - def _skip_starstar(self, uhandle) :
308 """Ignores any lines starting **""" 309 #See Bug 2353, some files from the EBI have extra lines 310 #starting "**" (two asterisks/stars), usually between the 311 #features and sequence but not all the time. They appear 312 #to be unofficial automated annotations. e.g. 313 #** 314 #** ################# INTERNAL SECTION ################## 315 #**HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003. 316 while "**" == uhandle.peekline()[:2] : 317 skip = uhandle.readline()
318 #print "Skipping line: %s" % skip.rstrip() 319
320 - def _scan_record(self, uhandle, consumer):
321 consumer.start_record() 322 for fn in self._scan_fns: 323 self._skip_starstar(uhandle) 324 fn(self, uhandle, consumer) 325 326 # In Release 38, ID N33_HUMAN has a DR buried within comments. 327 # Check for this and do more comments, if necessary. 328 # XXX handle this better 329 if fn is self._scan_dr.im_func: 330 self._scan_cc(uhandle, consumer) 331 self._scan_dr(uhandle, consumer) 332 consumer.end_record()
333
334 - def _scan_line(self, line_type, uhandle, event_fn, 335 exactly_one=None, one_or_more=None, any_number=None, 336 up_to_one=None):
337 # Callers must set exactly one of exactly_one, one_or_more, or 338 # any_number to a true value. I do not explicitly check to 339 # make sure this function is called correctly. 340 341 # This does not guarantee any parameter safety, but I 342 # like the readability. The other strategy I tried was have 343 # parameters min_lines, max_lines. 344 345 if exactly_one or one_or_more: 346 read_and_call(uhandle, event_fn, start=line_type) 347 if one_or_more or any_number: 348 while 1: 349 if not attempt_read_and_call(uhandle, event_fn, 350 start=line_type): 351 break 352 if up_to_one: 353 attempt_read_and_call(uhandle, event_fn, start=line_type)
354
355 - def _scan_id(self, uhandle, consumer):
356 self._scan_line('ID', uhandle, consumer.identification, exactly_one=1)
357
358 - def _scan_ac(self, uhandle, consumer):
359 # Until release 38, this used to match exactly_one. 360 # However, in release 39, 1A02_HUMAN has 2 AC lines, and the 361 # definition needed to be expanded. 362 self._scan_line('AC', uhandle, consumer.accession, any_number=1)
363
364 - def _scan_dt(self, uhandle, consumer):
365 self._scan_line('DT', uhandle, consumer.date, exactly_one=1) 366 self._scan_line('DT', uhandle, consumer.date, exactly_one=1) 367 # IPI doesn't necessarily contain the third line about annotations 368 self._scan_line('DT', uhandle, consumer.date, up_to_one=1)
369
370 - def _scan_de(self, uhandle, consumer):
371 # IPI can be missing a DE line 372 self._scan_line('DE', uhandle, consumer.description, any_number=1)
373
374 - def _scan_gn(self, uhandle, consumer):
375 self._scan_line('GN', uhandle, consumer.gene_name, any_number=1)
376
377 - def _scan_os(self, uhandle, consumer):
378 self._scan_line('OS', uhandle, consumer.organism_species, 379 one_or_more=1)
380
381 - def _scan_og(self, uhandle, consumer):
382 self._scan_line('OG', uhandle, consumer.organelle, any_number=1)
383
384 - def _scan_oc(self, uhandle, consumer):
385 self._scan_line('OC', uhandle, consumer.organism_classification, 386 one_or_more=1)
387
388 - def _scan_ox(self, uhandle, consumer):
389 self._scan_line('OX', uhandle, consumer.taxonomy_id, 390 any_number=1)
391
392 - def _scan_oh(self, uhandle, consumer):
393 # viral host organism. introduced after SwissProt 39. 394 self._scan_line('OH', uhandle, consumer.organism_host, any_number=1)
395
396 - def _scan_reference(self, uhandle, consumer):
397 while True: 398 if safe_peekline(uhandle)[:2] != 'RN': 399 break 400 self._scan_rn(uhandle, consumer) 401 self._scan_rp(uhandle, consumer) 402 self._scan_rc(uhandle, consumer) 403 self._scan_rx(uhandle, consumer) 404 # ws:2001-12-05 added, for record with RL before RA 405 self._scan_rl(uhandle, consumer) 406 self._scan_ra(uhandle, consumer) 407 #EBI copy of P72010 is missing the RT line, and has one 408 #of their ** lines in its place noting "** /NO TITLE." 409 #See also bug 2353 410 self._skip_starstar(uhandle) 411 self._scan_rt(uhandle, consumer) 412 self._scan_rl(uhandle, consumer)
413
414 - def _scan_rn(self, uhandle, consumer):
415 self._scan_line('RN', uhandle, consumer.reference_number, 416 exactly_one=1)
417
418 - def _scan_rp(self, uhandle, consumer):
419 self._scan_line('RP', uhandle, consumer.reference_position, 420 one_or_more=1)
421
422 - def _scan_rc(self, uhandle, consumer):
423 self._scan_line('RC', uhandle, consumer.reference_comment, 424 any_number=1)
425
426 - def _scan_rx(self, uhandle, consumer):
427 self._scan_line('RX', uhandle, consumer.reference_cross_reference, 428 any_number=1)
429
430 - def _scan_ra(self, uhandle, consumer):
431 # In UniProt release 1.12 of 6/21/04, there is a new RG 432 # (Reference Group) line, which references a group instead of 433 # an author. Each block must have at least 1 RA or RG line. 434 self._scan_line('RA', uhandle, consumer.reference_author, 435 any_number=1) 436 self._scan_line('RG', uhandle, consumer.reference_author, 437 any_number=1) 438 # PRKN_HUMAN has RG lines, then RA lines. The best solution 439 # is to write code that accepts either of the line types. 440 # This is the quick solution... 441 self._scan_line('RA', uhandle, consumer.reference_author, 442 any_number=1)
443
444 - def _scan_rt(self, uhandle, consumer):
445 self._scan_line('RT', uhandle, consumer.reference_title, 446 any_number=1)
447
448 - def _scan_rl(self, uhandle, consumer):
449 # This was one_or_more, but P82909 in TrEMBL 16.0 does not 450 # have one. 451 self._scan_line('RL', uhandle, consumer.reference_location, 452 any_number=1)
453
454 - def _scan_cc(self, uhandle, consumer):
455 self._scan_line('CC', uhandle, consumer.comment, any_number=1)
456
457 - def _scan_dr(self, uhandle, consumer):
458 self._scan_line('DR', uhandle, consumer.database_cross_reference, 459 any_number=1)
460
461 - def _scan_kw(self, uhandle, consumer):
462 self._scan_line('KW', uhandle, consumer.keyword, any_number=1)
463
464 - def _scan_ft(self, uhandle, consumer):
465 self._scan_line('FT', uhandle, consumer.feature_table, any_number=1)
466
467 - def _scan_pe(self, uhandle, consumer):
468 self._scan_line('PE', uhandle, consumer.protein_existence, any_number=1)
469
470 - def _scan_sq(self, uhandle, consumer):
471 self._scan_line('SQ', uhandle, consumer.sequence_header, exactly_one=1)
472
473 - def _scan_sequence_data(self, uhandle, consumer):
474 self._scan_line(' ', uhandle, consumer.sequence_data, one_or_more=1)
475
476 - def _scan_terminator(self, uhandle, consumer):
477 self._scan_line('//', uhandle, consumer.terminator, exactly_one=1)
478 479 _scan_fns = [ 480 _scan_id, 481 _scan_ac, 482 _scan_dt, 483 _scan_de, 484 _scan_gn, 485 _scan_os, 486 _scan_og, 487 _scan_oc, 488 _scan_ox, 489 _scan_oh, 490 _scan_reference, 491 _scan_cc, 492 _scan_dr, 493 _scan_pe, 494 _scan_kw, 495 _scan_ft, 496 _scan_sq, 497 _scan_sequence_data, 498 _scan_terminator 499 ]
500
501 -class _RecordConsumer(AbstractConsumer):
502 """Consumer that converts a SwissProt record to a Record object. 503 504 Members: 505 data Record with SwissProt data. 506 507 """
508 - def __init__(self):
509 self.data = None
510
511 - def __repr__(self) :
512 return "Bio.SwissProt.SProt._RecordConsumer()"
513
514 - def start_record(self):
515 self.data = Record() 516 self._sequence_lines = []
517
518 - def end_record(self):
519 self._clean_record(self.data) 520 self.data.sequence = "".join(self._sequence_lines)
521
522 - def identification(self, line):
523 cols = line.split() 524 #Prior to release 51, included with MoleculeType: 525 #ID EntryName DataClass; MoleculeType; SequenceLength. 526 # 527 #Newer files lack the MoleculeType: 528 #ID EntryName DataClass; SequenceLength. 529 # 530 #Note that cols is split on white space, so the length 531 #should become two fields (number and units) 532 if len(cols) == 6 : 533 self.data.entry_name = cols[1] 534 self.data.data_class = cols[2].rstrip(_CHOMP) # don't want ';' 535 self.data.molecule_type = cols[3].rstrip(_CHOMP) # don't want ';' 536 self.data.sequence_length = int(cols[4]) 537 elif len(cols) == 5 : 538 self.data.entry_name = cols[1] 539 self.data.data_class = cols[2].rstrip(_CHOMP) # don't want ';' 540 self.data.molecule_type = None 541 self.data.sequence_length = int(cols[3]) 542 else : 543 #Should we print a warning an continue? 544 raise ValueError("ID line has unrecognised format:\n"+line) 545 546 # data class can be 'STANDARD' or 'PRELIMINARY' 547 # ws:2001-12-05 added IPI 548 # pjc:2006-11-02 added 'Reviewed' and 'Unreviewed' 549 if self.data.data_class not in ['STANDARD', 'PRELIMINARY', 'IPI', 550 'Reviewed', 'Unreviewed']: 551 raise ValueError("Unrecognized data class %s in line\n%s" % \ 552 (self.data.data_class, line)) 553 # molecule_type should be 'PRT' for PRoTein 554 # Note that has been removed in recent releases (set to None) 555 if self.data.molecule_type is not None \ 556 and self.data.molecule_type != 'PRT': 557 raise ValueError("Unrecognized molecule type %s in line\n%s" % \ 558 (self.data.molecule_type, line))
559
560 - def accession(self, line):
561 cols = line[5:].rstrip(_CHOMP).strip().split(';') 562 for ac in cols: 563 if ac.strip() : 564 #remove any leading or trailing white space 565 self.data.accessions.append(ac.strip())
566
567 - def date(self, line):
568 uprline = line.upper() 569 cols = line.rstrip().split() 570 571 if uprline.find('CREATED') >= 0 \ 572 or uprline.find('LAST SEQUENCE UPDATE') >= 0 \ 573 or uprline.find('LAST ANNOTATION UPDATE') >= 0: 574 # Old style DT line 575 # ================= 576 # e.g. 577 # DT 01-FEB-1995 (Rel. 31, Created) 578 # DT 01-FEB-1995 (Rel. 31, Last sequence update) 579 # DT 01-OCT-2000 (Rel. 40, Last annotation update) 580 # 581 # or: 582 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 583 # ... 584 585 # find where the version information will be located 586 # This is needed for when you have cases like IPI where 587 # the release verison is in a different spot: 588 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 589 uprcols = uprline.split() 590 rel_index = -1 591 for index in range(len(uprcols)): 592 if uprcols[index].find("REL.") >= 0: 593 rel_index = index 594 assert rel_index >= 0, \ 595 "Could not find Rel. in DT line: %s" % (line) 596 version_index = rel_index + 1 597 # get the version information 598 str_version = cols[version_index].rstrip(_CHOMP) 599 # no version number 600 if str_version == '': 601 version = 0 602 # dot versioned 603 elif str_version.find(".") >= 0: 604 version = str_version 605 # integer versioned 606 else: 607 version = int(str_version) 608 609 if uprline.find('CREATED') >= 0: 610 self.data.created = cols[1], version 611 elif uprline.find('LAST SEQUENCE UPDATE') >= 0: 612 self.data.sequence_update = cols[1], version 613 elif uprline.find( 'LAST ANNOTATION UPDATE') >= 0: 614 self.data.annotation_update = cols[1], version 615 else: 616 assert False, "Shouldn't reach this line!" 617 elif uprline.find('INTEGRATED INTO') >= 0 \ 618 or uprline.find('SEQUENCE VERSION') >= 0 \ 619 or uprline.find('ENTRY VERSION') >= 0: 620 # New style DT line 621 # ================= 622 # As of UniProt Knowledgebase release 7.0 (including 623 # Swiss-Prot release 49.0 and TrEMBL release 32.0) the 624 # format of the DT lines and the version information 625 # in them was changed - the release number was dropped. 626 # 627 # For more information see bug 1948 and 628 # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0 629 # 630 # e.g. 631 # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot. 632 # DT 15-OCT-2001, sequence version 3. 633 # DT 01-APR-2004, entry version 14. 634 # 635 #This is a new style DT line... 636 637 # The date should be in string cols[1] 638 # Get the version number if there is one. 639 # For the three DT lines above: 0, 3, 14 640 try: 641 version = int(cols[-1]) 642 except ValueError : 643 version = 0 644 645 # Re-use the historical property names, even though 646 # the meaning has changed slighty: 647 if uprline.find("INTEGRATED") >= 0: 648 self.data.created = cols[1], version 649 elif uprline.find('SEQUENCE VERSION') >= 0: 650 self.data.sequence_update = cols[1], version 651 elif uprline.find( 'ENTRY VERSION') >= 0: 652 self.data.annotation_update = cols[1], version 653 else: 654 assert False, "Shouldn't reach this line!" 655 else: 656 raise ValueError("I don't understand the date line %s" % line)
657
658 - def description(self, line):
659 self.data.description += line[5:]
660
661 - def gene_name(self, line):
662 self.data.gene_name += line[5:]
663
664 - def organism_species(self, line):
665 self.data.organism += line[5:]
666
667 - def organelle(self, line):
668 self.data.organelle += line[5:]
669
670 - def organism_classification(self, line):
671 line = line[5:].rstrip(_CHOMP) 672 cols = line.split(';') 673 for col in cols: 674 self.data.organism_classification.append(col.lstrip())
675
676 - def taxonomy_id(self, line):
677 # The OX line is in the format: 678 # OX DESCRIPTION=ID[, ID]...; 679 # If there are too many id's to fit onto a line, then the ID's 680 # continue directly onto the next line, e.g. 681 # OX DESCRIPTION=ID[, ID]... 682 # OX ID[, ID]...; 683 # Currently, the description is always "NCBI_TaxID". 684 685 # To parse this, I need to check to see whether I'm at the 686 # first line. If I am, grab the description and make sure 687 # it's an NCBI ID. Then, grab all the id's. 688 line = line[5:].rstrip(_CHOMP) 689 index = line.find('=') 690 if index >= 0: 691 descr = line[:index] 692 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 693 ids = line[index+1:].split(',') 694 else: 695 ids = line.split(',') 696 self.data.taxonomy_id.extend([id.strip() for id in ids])
697
698 - def organism_host(self, line):
699 # Line type OH (Organism Host) for viral hosts 700 # same code as in taxonomy_id() 701 line = line[5:].rstrip(_CHOMP) 702 index = line.find('=') 703 if index >= 0: 704 descr = line[:index] 705 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 706 ids = line[index+1:].split(',') 707 else: 708 ids = line.split(',') 709 self.data.host_organism.extend([id.strip() for id in ids])
710
711 - def reference_number(self, line):
712 rn = line[5:].rstrip() 713 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn 714 ref = Reference() 715 ref.number = int(rn[1:-1]) 716 self.data.references.append(ref)
717
718 - def reference_position(self, line):
719 assert self.data.references, "RP: missing RN" 720 self.data.references[-1].positions.append(line[5:].rstrip())
721
722 - def reference_comment(self, line):
723 assert self.data.references, "RC: missing RN" 724 cols = line[5:].rstrip().split( ';') 725 ref = self.data.references[-1] 726 for col in cols: 727 if not col: # last column will be the empty string 728 continue 729 # The token is everything before the first '=' character. 730 index = col.find('=') 731 token, text = col[:index], col[index+1:] 732 # According to the spec, there should only be 1 '=' 733 # character. However, there are too many exceptions to 734 # handle, so we'll ease up and allow anything after the 735 # first '='. 736 #if col == ' STRAIN=TISSUE=BRAIN': 737 # # from CSP_MOUSE, release 38 738 # token, text = "TISSUE", "BRAIN" 739 #elif col == ' STRAIN=NCIB 9816-4, AND STRAIN=G7 / ATCC 17485': 740 # # from NDOA_PSEPU, release 38 741 # token, text = "STRAIN", "NCIB 9816-4 AND G7 / ATCC 17485" 742 #elif col == ' STRAIN=ISOLATE=NO 27, ANNO 1987' or \ 743 # col == ' STRAIN=ISOLATE=NO 27 / ANNO 1987': 744 # # from NU3M_BALPH, release 38, release 39 745 # token, text = "STRAIN", "ISOLATE NO 27, ANNO 1987" 746 #else: 747 # token, text = string.split(col, '=') 748 ref.comments.append((token.lstrip(), text))
749
750 - def reference_cross_reference(self, line):
751 assert self.data.references, "RX: missing RN" 752 # The basic (older?) RX line is of the form: 753 # RX MEDLINE; 85132727. 754 # but there are variants of this that need to be dealt with (see below) 755 756 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 757 # have extraneous information in the RX line. Check for 758 # this and chop it out of the line. 759 # (noticed by katel@worldpath.net) 760 ind = line.find('[NCBI, ExPASy, Israel, Japan]') 761 if ind >= 0: 762 line = line[:ind] 763 764 # RX lines can also be used of the form 765 # RX PubMed=9603189; 766 # reported by edvard@farmasi.uit.no 767 # and these can be more complicated like: 768 # RX MEDLINE=95385798; PubMed=7656980; 769 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 770 # We look for these cases first and deal with them 771 if line.find("=") != -1: 772 cols = line[2:].split("; ") 773 cols = [x.strip() for x in cols] 774 cols = [x for x in cols if x] 775 for col in cols: 776 x = col.split("=") 777 assert len(x) == 2, "I don't understand RX line %s" % line 778 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP) 779 ref = self.data.references[-1].references 780 ref.append((key, value)) 781 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 782 else: 783 cols = line.split() 784 # normally we split into the three parts 785 assert len(cols) == 3, "I don't understand RX line %s" % line 786 self.data.references[-1].references.append( 787 (cols[1].rstrip(_CHOMP), cols[2].rstrip(_CHOMP)))
788
789 - def reference_author(self, line):
790 assert self.data.references, "RA: missing RN" 791 ref = self.data.references[-1] 792 ref.authors += line[5:]
793
794 - def reference_title(self, line):
795 assert self.data.references, "RT: missing RN" 796 ref = self.data.references[-1] 797 ref.title += line[5:]
798
799 - def reference_location(self, line):
800 assert self.data.references, "RL: missing RN" 801 ref = self.data.references[-1] 802 ref.location += line[5:]
803
804 - def comment(self, line):
805 if line[5:8] == '-!-': # Make a new comment 806 self.data.comments.append(line[9:]) 807 elif line[5:8] == ' ': # add to the previous comment 808 if not self.data.comments: 809 # TCMO_STRGA in Release 37 has comment with no topic 810 self.data.comments.append(line[9:]) 811 else: 812 self.data.comments[-1] += line[9:] 813 elif line[5:8] == '---': 814 # If there are no comments, and it's not the closing line, 815 # make a new comment. 816 if not self.data.comments or self.data.comments[-1][:3] != '---': 817 self.data.comments.append(line[5:]) 818 else: 819 self.data.comments[-1] += line[5:] 820 else: # copyright notice 821 self.data.comments[-1] += line[5:]
822
823 - def database_cross_reference(self, line):
824 # From CLD1_HUMAN, Release 39: 825 # DR EMBL; [snip]; -. [EMBL / GenBank / DDBJ] [CoDingSequence] 826 # DR PRODOM [Domain structure / List of seq. sharing at least 1 domai 827 # DR SWISS-2DPAGE; GET REGION ON 2D PAGE. 828 line = line[5:] 829 # Remove the comments at the end of the line 830 i = line.find('[') 831 if i >= 0: 832 line = line[:i] 833 cols = line.rstrip(_CHOMP).split(';') 834 cols = [col.lstrip() for col in cols] 835 self.data.cross_references.append(tuple(cols))
836
837 - def keyword(self, line):
838 cols = line[5:].rstrip(_CHOMP).split(';') 839 self.data.keywords.extend([c.lstrip() for c in cols])
840
841 - def feature_table(self, line):
842 line = line[5:] # get rid of junk in front 843 name = line[0:8].rstrip() 844 try: 845 from_res = int(line[9:15]) 846 except ValueError: 847 from_res = line[9:15].lstrip() 848 try: 849 to_res = int(line[16:22]) 850 except ValueError: 851 to_res = line[16:22].lstrip() 852 description = line[29:70].rstrip() 853 #if there is a feature_id (FTId), store it away 854 if line[29:35]==r"/FTId=": 855 ft_id = line[35:70].rstrip()[:-1] 856 else: 857 ft_id ="" 858 if not name: # is continuation of last one 859 assert not from_res and not to_res 860 name, from_res, to_res, old_description,old_ft_id = self.data.features[-1] 861 del self.data.features[-1] 862 description = "%s %s" % (old_description, description) 863 864 # special case -- VARSPLIC, reported by edvard@farmasi.uit.no 865 if name == "VARSPLIC": 866 description = self._fix_varsplic_sequences(description) 867 self.data.features.append((name, from_res, to_res, description,ft_id))
868
869 - def _fix_varsplic_sequences(self, description):
870 """Remove unwanted spaces in sequences. 871 872 During line carryover, the sequences in VARSPLIC can get mangled 873 with unwanted spaces like: 874 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH' 875 We want to check for this case and correct it as it happens. 876 """ 877 descr_cols = description.split(" -> ") 878 if len(descr_cols) == 2: 879 first_seq = descr_cols[0] 880 second_seq = descr_cols[1] 881 extra_info = '' 882 # we might have more information at the end of the 883 # second sequence, which should be in parenthesis 884 extra_info_pos = second_seq.find(" (") 885 if extra_info_pos != -1: 886 extra_info = second_seq[extra_info_pos:] 887 second_seq = second_seq[:extra_info_pos] 888 889 # now clean spaces out of the first and second string 890 first_seq = first_seq.replace(" ", "") 891 second_seq = second_seq.replace(" ", "") 892 893 # reassemble the description 894 description = first_seq + " -> " + second_seq + extra_info 895 896 return description
897
898 - def protein_existence(self, line):
899 #TODO - Record this information? 900 pass
901
902 - def sequence_header(self, line):
903 cols = line.split() 904 assert len(cols) == 8, "I don't understand SQ line %s" % line 905 # Do more checking here? 906 self.data.seqinfo = int(cols[2]), int(cols[4]), cols[6]
907
908 - def sequence_data(self, line):
909 #It should be faster to make a list of strings, and join them at the end. 910 self._sequence_lines.append(line.replace(" ", "").rstrip())
911
912 - def terminator(self, line):
913 pass
914 915 #def _clean(self, line, rstrip=1): 916 # if rstrip: 917 # return string.rstrip(line[5:]) 918 # return line[5:] 919
920 - def _clean_record(self, rec):
921 # Remove trailing newlines 922 members = ['description', 'gene_name', 'organism', 'organelle'] 923 for m in members: 924 attr = getattr(rec, m) 925 setattr(rec, m, attr.rstrip()) 926 for ref in rec.references: 927 self._clean_references(ref)
928
929 - def _clean_references(self, ref):
930 # Remove trailing newlines 931 members = ['authors', 'title', 'location'] 932 for m in members: 933 attr = getattr(ref, m) 934 setattr(ref, m, attr.rstrip())
935
936 -class _SequenceConsumer(AbstractConsumer):
937 """Consumer that converts a SwissProt record to a SeqRecord object. 938 939 Members: 940 data Record with SwissProt data. 941 alphabet The alphabet the generated Seq objects will have. 942 """ 943 #TODO - Cope with references as done for GenBank
944 - def __init__(self, alphabet = Alphabet.generic_protein):
945 """Initialize a Sequence Consumer 946 947 Arguments: 948 o alphabet - The alphabet to use for the generated Seq objects. If 949 not supplied this will default to the generic protein alphabet. 950 """ 951 self.data = None 952 self.alphabet = alphabet
953
954 - def start_record(self):
955 seq = Seq.Seq("", self.alphabet) 956 self.data = SeqRecord.SeqRecord(seq) 957 self.data.description = "" 958 self.data.name = "" 959 self._current_ref = None 960 self._sequence_lines = []
961
962 - def end_record(self):
963 if self._current_ref is not None: 964 self.data.annotations['references'].append(self._current_ref) 965 self._current_ref = None 966 self.data.description = self.data.description.rstrip() 967 self.data.seq = Seq.Seq("".join(self._sequence_lines), self.alphabet)
968
969 - def identification(self, line):
970 cols = line.split() 971 self.data.name = cols[1]
972
973 - def accession(self, line):
974 #Note that files can and often do contain multiple AC lines. 975 ids = line[5:].strip().split(';') 976 #Remove any white space 977 ids = [x.strip() for x in ids if x.strip()] 978 979 #Use the first as the ID, but record them ALL in the annotations 980 try : 981 self.data.annotations['accessions'].extend(ids) 982 except KeyError : 983 self.data.annotations['accessions'] = ids 984 985 #Use the FIRST accession as the ID, not the first on this line! 986 self.data.id = self.data.annotations['accessions'][0]
987 #self.data.id = ids[0] 988
989 - def description(self, line):
990 self.data.description = self.data.description + \ 991 line[5:].strip() + "\n"
992
993 - def sequence_data(self, line):
994 #It should be faster to make a list of strings, and join them at the end. 995 self._sequence_lines.append(line.replace(" ", "").rstrip())
996
997 - def gene_name(self, line):
998 #We already store the identification/accession as the records name/id 999 try : 1000 self.data.annotations['gene_name'] += line[5:] 1001 except KeyError : 1002 self.data.annotations['gene_name'] = line[5:]
1003
1004 - def comment(self, line):
1005 #Try and agree with SeqRecord convention from the GenBank parser, 1006 #which stores the comments as a long string with newlines 1007 #with key 'comment' 1008 try : 1009 self.data.annotations['comment'] += "\n" + line[5:] 1010 except KeyError : 1011 self.data.annotations['comment'] = line[5:]
1012 #TODO - Follow SwissProt conventions more closely? 1013
1014 - def database_cross_reference(self, line):
1015 #Format of the line is described in the manual dated 04-Dec-2007 as: 1016 #DR DATABASE; PRIMARY; SECONDARY[; TERTIARY][; QUATERNARY]. 1017 #However, some older files only seem to have a single identifier: 1018 #DR DATABASE; PRIMARY. 1019 # 1020 #Also must cope with things like this from Tests/SwissProt/sp007, 1021 #DR PRODOM [Domain structure / List of seq. sharing at least 1 domain] 1022 # 1023 #Store these in the dbxref list, but for consistency with 1024 #the GenBank parser and with what BioSQL can cope with, 1025 #store only DATABASE_IDENTIFIER:PRIMARY_IDENTIFIER 1026 parts = [x.strip() for x in line[5:].strip(_CHOMP).split(";")] 1027 if len(parts) > 1 : 1028 value = "%s:%s" % (parts[0], parts[1]) 1029 #Avoid duplicate entries 1030 if value not in self.data.dbxrefs : 1031 self.data.dbxrefs.append(value)
1032 #else : 1033 #print "Bad DR line:\n%s" % line 1034 1035
1036 - def date(self, line):
1037 date_str = line.split()[0] 1038 uprline = line.upper() 1039 if uprline.find('CREATED') >= 0 : 1040 #Try and agree with SeqRecord convention from the GenBank parser, 1041 #which stores the submitted date as 'date' 1042 self.data.annotations['date'] = date_str 1043 elif uprline.find('LAST SEQUENCE UPDATE') >= 0 : 1044 #There is no existing convention from the GenBank SeqRecord parser 1045 self.data.annotations['date_last_sequence_update'] = date_str 1046 elif uprline.find('LAST ANNOTATION UPDATE') >= 0: 1047 #There is no existing convention from the GenBank SeqRecord parser 1048 self.data.annotations['date_last_annotation_update'] = date_str
1049
1050 - def keyword(self, line):
1051 #Try and agree with SeqRecord convention from the GenBank parser, 1052 #which stores a list as 'keywords' 1053 cols = line[5:].rstrip(_CHOMP).split(';') 1054 cols = [c.strip() for c in cols] 1055 cols = filter(None, cols) 1056 try : 1057 #Extend any existing list of keywords 1058 self.data.annotations['keywords'].extend(cols) 1059 except KeyError : 1060 #Create the list of keywords 1061 self.data.annotations['keywords'] = cols
1062
1063 - def organism_species(self, line):
1064 #Try and agree with SeqRecord convention from the GenBank parser, 1065 #which stores the organism as a string with key 'organism' 1066 data = line[5:].rstrip(_CHOMP) 1067 try : 1068 #Append to any existing data split over multiple lines 1069 self.data.annotations['organism'] += " " + data 1070 except KeyError: 1071 self.data.annotations['organism'] = data
1072
1073 - def organism_host(self, line):
1074 #There is no SeqRecord convention from the GenBank parser, 1075 data = line[5:].rstrip(_CHOMP) 1076 index = data.find('=') 1077 if index >= 0: 1078 descr = data[:index] 1079 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 1080 ids = data[index+1:].split(',') 1081 else: 1082 ids = data.split(',') 1083 1084 try : 1085 #Append to any existing data 1086 self.data.annotations['organism_host'].extend(ids) 1087 except KeyError: 1088 self.data.annotations['organism_host'] = ids
1089
1090 - def organism_classification(self, line):
1091 #Try and agree with SeqRecord convention from the GenBank parser, 1092 #which stores this taxonomy lineage ese as a list of strings with 1093 #key 'taxonomy'. 1094 #Note that 'ncbi_taxid' is used for the taxonomy ID (line OX) 1095 line = line[5:].rstrip(_CHOMP) 1096 cols = [col.strip() for col in line.split(';')] 1097 try : 1098 #Append to any existing data 1099 self.data.annotations['taxonomy'].extend(cols) 1100 except KeyError: 1101 self.data.annotations['taxonomy'] = cols
1102
1103 - def taxonomy_id(self, line):
1104 #Try and agree with SeqRecord convention expected in BioSQL 1105 #the NCBI taxon id with key 'ncbi_taxid'. 1106 #Note that 'taxonomy' is used for the taxonomy lineage 1107 #(held as a list of strings, line type OC) 1108 1109 line = line[5:].rstrip(_CHOMP) 1110 index = line.find('=') 1111 if index >= 0: 1112 descr = line[:index] 1113 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 1114 ids = line[index+1:].split(',') 1115 else: 1116 ids = line.split(',') 1117 1118 try : 1119 #Append to any existing data 1120 self.data.annotations['ncbi_taxid'].extend(ids) 1121 except KeyError: 1122 self.data.annotations['ncbi_taxid'] = ids
1123
1124 - def reference_number(self, line):
1125 """RN line, reference number (start of new reference).""" 1126 from Bio.SeqFeature import Reference 1127 # if we have a current reference that hasn't been added to 1128 # the list of references, add it. 1129 if self._current_ref is not None: 1130 self.data.annotations['references'].append(self._current_ref) 1131 else: 1132 self.data.annotations['references'] = [] 1133 1134 self._current_ref = Reference()
1135
1136 - def reference_position(self, line):
1137 """RP line, reference position.""" 1138 assert self._current_ref is not None, "RP: missing RN" 1139 #Should try and store this in self._current_ref.location 1140 #but the SwissProt locations don't match easily to the 1141 #format used in GenBank... 1142 pass
1143
1144 - def reference_cross_reference(self, line):
1145 """RX line, reference cross-references.""" 1146 assert self._current_ref is not None, "RX: missing RN" 1147 # The basic (older?) RX line is of the form: 1148 # RX MEDLINE; 85132727. 1149 # or more recently: 1150 # RX MEDLINE=95385798; PubMed=7656980; 1151 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 1152 # We look for these cases first and deal with them 1153 if line.find("=") != -1: 1154 cols = line[2:].split("; ") 1155 cols = [x.strip() for x in cols] 1156 cols = [x for x in cols if x] 1157 for col in cols: 1158 x = col.split("=") 1159 assert len(x) == 2, "I don't understand RX line %s" % line 1160 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP) 1161 if key == "MEDLINE" : 1162 self._current_ref.medline_id = value 1163 elif key == "PubMed" : 1164 self._current_ref.pubmed_id = value 1165 else : 1166 #Sadly the SeqFeature.Reference object doesn't 1167 #support anything else (yet) 1168 pass 1169 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 1170 else: 1171 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 1172 # have extraneous information in the RX line. Check for 1173 # this and chop it out of the line. 1174 # (noticed by katel@worldpath.net) 1175 ind = line.find('[NCBI, ExPASy, Israel, Japan]') 1176 if ind >= 0: 1177 line = line[:ind] 1178 cols = line.split() 1179 # normally we split into the three parts 1180 assert len(cols) == 3, "I don't understand RX line %s" % line 1181 key = cols[1].rstrip(_CHOMP) 1182 value = cols[2].rstrip(_CHOMP) 1183 if key == "MEDLINE" : 1184 self._current_ref.medline_id = value 1185 elif key == "PubMed" : 1186 self._current_ref.pubmed_id = value 1187 else : 1188 #Sadly the SeqFeature.Reference object doesn't 1189 #support anything else (yet) 1190 pass
1191
1192 - def reference_author(self, line):
1193 """RA line, reference author(s).""" 1194 assert self._current_ref is not None, "RA: missing RN" 1195 self._current_ref.authors += line[5:].rstrip("\n")
1196
1197 - def reference_title(self, line):
1198 """RT line, reference title.""" 1199 assert self._current_ref is not None, "RT: missing RN" 1200 self._current_ref.title += line[5:].rstrip("\n")
1201
1202 - def reference_location(self, line):
1203 """RL line, reference 'location' - journal, volume, pages, year.""" 1204 assert self._current_ref is not None, "RL: missing RN" 1205 self._current_ref.journal += line[5:].rstrip("\n")
1206
1207 - def reference_comment(self, line):
1208 """RC line, reference comment.""" 1209 assert self._current_ref is not None, "RC: missing RN" 1210 #This has a key=value; structure... 1211 #Can we do a better job with the current Reference class? 1212 self._current_ref.comment += line[5:].rstrip("\n")
1213
1214 -def index_file(filename, indexname, rec2key=None):
1215 """index_file(filename, indexname, rec2key=None) 1216 1217 Index a SwissProt file. filename is the name of the file. 1218 indexname is the name of the dictionary. rec2key is an 1219 optional callback that takes a Record and generates a unique key 1220 (e.g. the accession number) for the record. If not specified, 1221 the entry name will be used. 1222 1223 """ 1224 from Bio.SwissProt import parse 1225 if not os.path.exists(filename): 1226 raise ValueError("%s does not exist" % filename) 1227 1228 index = Index.Index(indexname, truncate=1) 1229 index[Dictionary._Dictionary__filename_key] = filename 1230 1231 handle = open(filename) 1232 records = parse(handle) 1233 end = 0L 1234 for record in records: 1235 start = end 1236 end = long(handle.tell()) 1237 length = end - start 1238 1239 if rec2key is not None: 1240 key = rec2key(record) 1241 else: 1242 key = record.entry_name 1243 1244 if not key: 1245 raise KeyError("empty sequence key was produced") 1246 elif key in index: 1247 raise KeyError("duplicate key %s found" % key) 1248 1249 index[key] = start, length
1250