Package Bio :: Package SwissProt
[hide private]
[frames] | no frames]

Source Code for Package Bio.SwissProt

  1  # Copyright 2007 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  This module provides code to work with the sprotXX.dat file from 
  7  SwissProt. 
  8  http://www.expasy.ch/sprot/sprot-top.html 
  9   
 10  Tested with: 
 11  Release 56.9, 03-March-2009. 
 12   
 13   
 14  Classes: 
 15  Record             Holds SwissProt data. 
 16  Reference          Holds reference data from a SwissProt record. 
 17   
 18  Functions: 
 19  read               Read one SwissProt record 
 20  parse              Read multiple SwissProt records 
 21   
 22  """ 
 23   
 24   
25 -class Record:
26 """Holds information from a SwissProt record. 27 28 Members: 29 entry_name Name of this entry, e.g. RL1_ECOLI. 30 data_class Either 'STANDARD' or 'PRELIMINARY'. 31 molecule_type Type of molecule, 'PRT', 32 sequence_length Number of residues. 33 34 accessions List of the accession numbers, e.g. ['P00321'] 35 created A tuple of (date, release). 36 sequence_update A tuple of (date, release). 37 annotation_update A tuple of (date, release). 38 39 description Free-format description. 40 gene_name Gene name. See userman.txt for description. 41 organism The source of the sequence. 42 organelle The origin of the sequence. 43 organism_classification The taxonomy classification. List of strings. 44 (http://www.ncbi.nlm.nih.gov/Taxonomy/) 45 taxonomy_id A list of NCBI taxonomy id's. 46 host_organism A list of NCBI taxonomy id's of the hosts of a virus, 47 if any. 48 references List of Reference objects. 49 comments List of strings. 50 cross_references List of tuples (db, id1[, id2][, id3]). See the docs. 51 keywords List of the keywords. 52 features List of tuples (key name, from, to, description). 53 from and to can be either integers for the residue 54 numbers, '<', '>', or '?' 55 56 seqinfo tuple of (length, molecular weight, CRC32 value) 57 sequence The sequence. 58 59 """
60 - def __init__(self):
61 self.entry_name = None 62 self.data_class = None 63 self.molecule_type = None 64 self.sequence_length = None 65 66 self.accessions = [] 67 self.created = None 68 self.sequence_update = None 69 self.annotation_update = None 70 71 self.description = '' 72 self.gene_name = '' 73 self.organism = '' 74 self.organelle = '' 75 self.organism_classification = [] 76 self.taxonomy_id = [] 77 self.host_organism = [] 78 self.references = [] 79 self.comments = [] 80 self.cross_references = [] 81 self.keywords = [] 82 self.features = [] 83 84 self.seqinfo = None 85 self.sequence = ''
86 87
88 -class Reference:
89 """Holds information from one reference in a SwissProt entry. 90 91 Members: 92 number Number of reference in an entry. 93 positions Describes extent of work. list of strings. 94 comments Comments. List of (token, text). 95 references References. List of (dbname, identifier) 96 authors The authors of the work. 97 title Title of the work. 98 location A citation for the work. 99 100 """
101 - def __init__(self):
102 self.number = None 103 self.positions = [] 104 self.comments = [] 105 self.references = [] 106 self.authors = '' 107 self.title = '' 108 self.location = ''
109 110
111 -def parse(handle):
112 while True: 113 record = _read(handle) 114 if not record: 115 return 116 yield record
117 118
119 -def read(handle):
120 record = _read(handle) 121 if not record: 122 raise ValueError("No SwissProt record found") 123 # We should have reached the end of the record by now 124 remainder = handle.read() 125 if remainder: 126 raise ValueError("More than one SwissProt record found") 127 return record
128 129 130 # Everything below is considered private 131 132
133 -def _read(handle):
134 record = None 135 for line in handle: 136 key, value = line[:2], line[5:].rstrip() 137 if key=='**': 138 #See Bug 2353, some files from the EBI have extra lines 139 #starting "**" (two asterisks/stars). They appear 140 #to be unofficial automated annotations. e.g. 141 #** 142 #** ################# INTERNAL SECTION ################## 143 #**HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003. 144 pass 145 elif key=='ID': 146 record = Record() 147 _read_id(record, line) 148 _sequence_lines = [] 149 elif key=='AC': 150 accessions = [word for word in value.rstrip(";").split("; ")] 151 record.accessions.extend(accessions) 152 elif key=='DT': 153 _read_dt(record, line) 154 elif key=='DE': 155 record.description += line[5:] 156 elif key=='GN': 157 record.gene_name += line[5:] 158 elif key=='OS': 159 record.organism += line[5:] 160 elif key=='OG': 161 record.organelle += line[5:] 162 elif key=='OC': 163 cols = [col for col in value.rstrip(";.").split("; ")] 164 record.organism_classification.extend(cols) 165 elif key=='OX': 166 _read_ox(record, line) 167 elif key=='OH': 168 _read_oh(record, line) 169 elif key=='RN': 170 reference = Reference() 171 _read_rn(reference, value) 172 record.references.append(reference) 173 elif key=='RP': 174 assert record.references, "RP: missing RN" 175 record.references[-1].positions.append(value) 176 elif key=='RC': 177 assert record.references, "RC: missing RN" 178 reference = record.references[-1] 179 _read_rc(reference, value) 180 elif key=='RX': 181 assert record.references, "RX: missing RN" 182 reference = record.references[-1] 183 _read_rx(reference, value) 184 elif key=='RL': 185 assert record.references, "RL: missing RN" 186 reference = record.references[-1] 187 reference.location += line[5:] 188 # In UniProt release 1.12 of 6/21/04, there is a new RG 189 # (Reference Group) line, which references a group instead of 190 # an author. Each block must have at least 1 RA or RG line. 191 elif key=='RA': 192 assert record.references, "RA: missing RN" 193 reference = record.references[-1] 194 reference.authors += line[5:] 195 elif key=='RG': 196 assert record.references, "RG: missing RN" 197 reference = record.references[-1] 198 reference.authors += line[5:] 199 elif key=="RT": 200 assert record.references, "RT: missing RN" 201 reference = record.references[-1] 202 reference.title += line[5:] 203 elif key=='CC': 204 _read_cc(record, line) 205 elif key=='DR': 206 _read_dr(record, value) 207 elif key=='PE': 208 #TODO - Record this information? 209 pass 210 elif key=='KW': 211 cols = value.rstrip(";.").split('; ') 212 record.keywords.extend(cols) 213 elif key=='FT': 214 _read_ft(record, line) 215 elif key=='SQ': 216 cols = value.split() 217 assert len(cols) == 7, "I don't understand SQ line %s" % line 218 # Do more checking here? 219 record.seqinfo = int(cols[1]), int(cols[3]), cols[5] 220 elif key==' ': 221 _sequence_lines.append(value.replace(" ", "").rstrip()) 222 elif key=='//': 223 # Remove trailing newlines 224 record.description = record.description.rstrip() 225 record.gene_name = record.gene_name.rstrip() 226 record.organism = record.organism.rstrip() 227 record.organelle = record.organelle.rstrip() 228 for reference in record.references: 229 # Remove trailing newlines 230 reference.authors = reference.authors.rstrip() 231 reference.title = reference.title.rstrip() 232 reference.location = reference.location.rstrip() 233 record.sequence = "".join(_sequence_lines) 234 return record 235 else: 236 raise ValueError("Unknown keyword %s found" % keyword) 237 if record: 238 raise ValueError("Unexpected end of stream.")
239 240
241 -def _read_id(record, line):
242 cols = line[5:].split() 243 #Prior to release 51, included with MoleculeType: 244 #ID EntryName DataClass; MoleculeType; SequenceLength AA. 245 # 246 #Newer files lack the MoleculeType: 247 #ID EntryName DataClass; SequenceLength AA. 248 if len(cols) == 5 : 249 record.entry_name = cols[0] 250 record.data_class = cols[1].rstrip(";") 251 record.molecule_type = cols[2].rstrip(";") 252 record.sequence_length = int(cols[3]) 253 elif len(cols) == 4 : 254 record.entry_name = cols[0] 255 record.data_class = cols[1].rstrip(";") 256 record.molecule_type = None 257 record.sequence_length = int(cols[2]) 258 else : 259 raise ValueError("ID line has unrecognised format:\n"+line) 260 # check if the data class is one of the allowed values 261 allowed = ('STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed') 262 if record.data_class not in allowed: 263 raise ValueError("Unrecognized data class %s in line\n%s" % \ 264 (record.data_class, line)) 265 # molecule_type should be 'PRT' for PRoTein 266 # Note that has been removed in recent releases (set to None) 267 if record.molecule_type not in (None, 'PRT'): 268 raise ValueError("Unrecognized molecule type %s in line\n%s" % \ 269 (record.molecule_type, line))
270 271
272 -def _read_dt(record, line):
273 value = line[5:] 274 uprline = value.upper() 275 cols = value.rstrip().split() 276 if 'CREATED' in uprline \ 277 or 'LAST SEQUENCE UPDATE' in uprline \ 278 or 'LAST ANNOTATION UPDATE' in uprline: 279 # Old style DT line 280 # ================= 281 # e.g. 282 # DT 01-FEB-1995 (Rel. 31, Created) 283 # DT 01-FEB-1995 (Rel. 31, Last sequence update) 284 # DT 01-OCT-2000 (Rel. 40, Last annotation update) 285 # 286 # or: 287 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 288 # ... 289 290 # find where the version information will be located 291 # This is needed for when you have cases like IPI where 292 # the release verison is in a different spot: 293 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 294 uprcols = uprline.split() 295 rel_index = -1 296 for index in range(len(uprcols)): 297 if uprcols[index].find("REL.") >= 0: 298 rel_index = index 299 assert rel_index >= 0, \ 300 "Could not find Rel. in DT line: %s" % line 301 version_index = rel_index + 1 302 # get the version information 303 str_version = cols[version_index].rstrip(",") 304 # no version number 305 if str_version == '': 306 version = 0 307 # dot versioned 308 elif str_version.find(".") >= 0: 309 version = str_version 310 # integer versioned 311 else: 312 version = int(str_version) 313 314 if 'CREATED' in uprline: 315 record.created = cols[1], version 316 elif 'LAST SEQUENCE UPDATE' in uprline: 317 record.sequence_update = cols[1], version 318 elif 'LAST ANNOTATION UPDATE' in uprline: 319 record.annotation_update = cols[1], version 320 else: 321 assert False, "Shouldn't reach this line!" 322 elif 'INTEGRATED INTO' in uprline \ 323 or 'SEQUENCE VERSION' in uprline \ 324 or 'ENTRY VERSION' in uprline: 325 # New style DT line 326 # ================= 327 # As of UniProt Knowledgebase release 7.0 (including 328 # Swiss-Prot release 49.0 and TrEMBL release 32.0) the 329 # format of the DT lines and the version information 330 # in them was changed - the release number was dropped. 331 # 332 # For more information see bug 1948 and 333 # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0 334 # 335 # e.g. 336 # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot. 337 # DT 15-OCT-2001, sequence version 3. 338 # DT 01-APR-2004, entry version 14. 339 # 340 #This is a new style DT line... 341 342 # The date should be in string cols[1] 343 # Get the version number if there is one. 344 # For the three DT lines above: 0, 3, 14 345 try: 346 version = int(cols[-1]) 347 except ValueError : 348 version = 0 349 350 # Re-use the historical property names, even though 351 # the meaning has changed slighty: 352 if "INTEGRATED" in uprline: 353 record.created = cols[1], version 354 elif 'SEQUENCE VERSION' in uprline: 355 record.sequence_update = cols[1], version 356 elif 'ENTRY VERSION' in uprline: 357 record.annotation_update = cols[1], version 358 else: 359 assert False, "Shouldn't reach this line!" 360 else: 361 raise ValueError("I don't understand the date line %s" % line)
362 363
364 -def _read_ox(record, line):
365 # The OX line is in the format: 366 # OX DESCRIPTION=ID[, ID]...; 367 # If there are too many id's to fit onto a line, then the ID's 368 # continue directly onto the next line, e.g. 369 # OX DESCRIPTION=ID[, ID]... 370 # OX ID[, ID]...; 371 # Currently, the description is always "NCBI_TaxID". 372 # To parse this, I need to check to see whether I'm at the 373 # first line. If I am, grab the description and make sure 374 # it's an NCBI ID. Then, grab all the id's. 375 if record.taxonomy_id: 376 ids = line[5:].rstrip().rstrip(";") 377 else: 378 descr, ids = line[5:].rstrip().rstrip(";").split("=") 379 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 380 record.taxonomy_id.extend(ids.split(', '))
381 382
383 -def _read_oh(record, line):
384 # Line type OH (Organism Host) for viral hosts 385 # same code as in taxonomy_id() 386 if record.host_organism: 387 ids = line[5:].rstrip().rstrip(";") 388 else: 389 descr, ids = line[5:].rstrip().rstrip(";").split("=") 390 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 391 record.host_organism.extend(ids.split(', '))
392 393
394 -def _read_rn(reference, rn):
395 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn 396 reference.number = int(rn[1:-1])
397 398
399 -def _read_rc(reference, value):
400 cols = value.split(';') 401 for col in cols: 402 if not col: # last column will be the empty string 403 return 404 # The token is everything before the first '=' character. 405 i = col.find("=") 406 if i>=0: 407 token, text = col[:i], col[i+1:] 408 comment = token.lstrip(), text 409 reference.comments.append(comment) 410 else: 411 comment = reference.comments[-1] 412 comment = "%s %s" % (comment, col) 413 reference.comments[-1] = comment
414 415
416 -def _read_rx(reference, value):
417 # The basic (older?) RX line is of the form: 418 # RX MEDLINE; 85132727. 419 # but there are variants of this that need to be dealt with (see below) 420 421 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 422 # have extraneous information in the RX line. Check for 423 # this and chop it out of the line. 424 # (noticed by katel@worldpath.net) 425 value = value.replace(' [NCBI, ExPASy, Israel, Japan]','') 426 427 # RX lines can also be used of the form 428 # RX PubMed=9603189; 429 # reported by edvard@farmasi.uit.no 430 # and these can be more complicated like: 431 # RX MEDLINE=95385798; PubMed=7656980; 432 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 433 # We look for these cases first and deal with them 434 if "=" in value: 435 cols = value.split("; ") 436 cols = [x.strip() for x in cols] 437 cols = [x for x in cols if x] 438 for col in cols: 439 x = col.split("=") 440 assert len(x) == 2, "I don't understand RX line %s" % line 441 key, value = x[0], x[1].rstrip(";") 442 reference.references.append((key, value)) 443 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 444 else: 445 cols = value.split("; ") 446 # normally we split into the three parts 447 assert len(cols) == 2, "I don't understand RX line %s" % line 448 reference.references.append((cols[0].rstrip(";"), cols[1].rstrip(".")))
449 450
451 -def _read_cc(record, line):
452 key, value = line[5:8], line[8:] 453 if key=='-!-': # Make a new comment 454 record.comments.append(value) 455 elif key==' ': # add to the previous comment 456 if not record.comments: 457 # TCMO_STRGA in Release 37 has comment with no topic 458 record.comments.append(value) 459 else: 460 record.comments[-1] += value 461 elif key=='---': 462 # If there are no comments, and it's not the closing line, 463 # make a new comment. 464 if not record.comments or record.comments[-1][:3] != '---': 465 record.comments.append(line[5:]) 466 else: 467 record.comments[-1] += line[5:] 468 else: # copyright notice 469 record.comments[-1] += line[5:]
470 471
472 -def _read_dr(record, value):
473 # Remove the comments at the end of the line 474 i = value.find(' [') 475 if i >= 0: 476 value = value[:i] 477 cols = value.rstrip(".").split('; ') 478 record.cross_references.append(tuple(cols))
479 480
481 -def _read_ft(record, line):
482 line = line[5:] # get rid of junk in front 483 name = line[0:8].rstrip() 484 try: 485 from_res = int(line[9:15]) 486 except ValueError: 487 from_res = line[9:15].lstrip() 488 try: 489 to_res = int(line[16:22]) 490 except ValueError: 491 to_res = line[16:22].lstrip() 492 description = line[29:70].rstrip() 493 #if there is a feature_id (FTId), store it away 494 if line[29:35]==r"/FTId=": 495 ft_id = line[35:70].rstrip()[:-1] 496 else: 497 ft_id ="" 498 if not name: # is continuation of last one 499 assert not from_res and not to_res 500 name, from_res, to_res, old_description,old_ft_id = record.features[-1] 501 del record.features[-1] 502 description = "%s %s" % (old_description, description) 503 504 # special case -- VARSPLIC, reported by edvard@farmasi.uit.no 505 if name == "VARSPLIC": 506 # Remove unwanted spaces in sequences. 507 # During line carryover, the sequences in VARSPLIC can get mangled 508 # with unwanted spaces like: 509 # 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH' 510 # We want to check for this case and correct it as it happens. 511 descr_cols = description.split(" -> ") 512 if len(descr_cols) == 2: 513 first_seq, second_seq = descr_cols 514 extra_info = '' 515 # we might have more information at the end of the 516 # second sequence, which should be in parenthesis 517 extra_info_pos = second_seq.find(" (") 518 if extra_info_pos != -1: 519 extra_info = second_seq[extra_info_pos:] 520 second_seq = second_seq[:extra_info_pos] 521 # now clean spaces out of the first and second string 522 first_seq = first_seq.replace(" ", "") 523 second_seq = second_seq.replace(" ", "") 524 # reassemble the description 525 description = first_seq + " -> " + second_seq + extra_info 526 record.features.append((name, from_res, to_res, description,ft_id))
527 528 529 if __name__ == "__main__" : 530 print "Quick self test..." 531 532 example_filename = "../../Tests/SwissProt/sp008" 533 534 import os 535 if not os.path.isfile(example_filename): 536 print "Missing test file %s" % example_filename 537 else : 538 #Try parsing it! 539 540 handle = open(example_filename) 541 records = parse(handle) 542 for record in records: 543 print record.entry_name 544 print ",".join(record.accessions) 545 print record.keywords 546 print repr(record.organism) 547 print record.sequence[:20] + "..." 548 handle.close() 549