1
2
3
4
5 """
6 This module provides code to work with the sprotXX.dat file from
7 SwissProt.
8 http://www.expasy.ch/sprot/sprot-top.html
9
10 Tested with:
11 Release 56.9, 03-March-2009.
12
13
14 Classes:
15 Record Holds SwissProt data.
16 Reference Holds reference data from a SwissProt record.
17
18 Functions:
19 read Read one SwissProt record
20 parse Read multiple SwissProt records
21
22 """
23
24
26 """Holds information from a SwissProt record.
27
28 Members:
29 entry_name Name of this entry, e.g. RL1_ECOLI.
30 data_class Either 'STANDARD' or 'PRELIMINARY'.
31 molecule_type Type of molecule, 'PRT',
32 sequence_length Number of residues.
33
34 accessions List of the accession numbers, e.g. ['P00321']
35 created A tuple of (date, release).
36 sequence_update A tuple of (date, release).
37 annotation_update A tuple of (date, release).
38
39 description Free-format description.
40 gene_name Gene name. See userman.txt for description.
41 organism The source of the sequence.
42 organelle The origin of the sequence.
43 organism_classification The taxonomy classification. List of strings.
44 (http://www.ncbi.nlm.nih.gov/Taxonomy/)
45 taxonomy_id A list of NCBI taxonomy id's.
46 host_organism A list of NCBI taxonomy id's of the hosts of a virus,
47 if any.
48 references List of Reference objects.
49 comments List of strings.
50 cross_references List of tuples (db, id1[, id2][, id3]). See the docs.
51 keywords List of the keywords.
52 features List of tuples (key name, from, to, description).
53 from and to can be either integers for the residue
54 numbers, '<', '>', or '?'
55
56 seqinfo tuple of (length, molecular weight, CRC32 value)
57 sequence The sequence.
58
59 """
61 self.entry_name = None
62 self.data_class = None
63 self.molecule_type = None
64 self.sequence_length = None
65
66 self.accessions = []
67 self.created = None
68 self.sequence_update = None
69 self.annotation_update = None
70
71 self.description = ''
72 self.gene_name = ''
73 self.organism = ''
74 self.organelle = ''
75 self.organism_classification = []
76 self.taxonomy_id = []
77 self.host_organism = []
78 self.references = []
79 self.comments = []
80 self.cross_references = []
81 self.keywords = []
82 self.features = []
83
84 self.seqinfo = None
85 self.sequence = ''
86
87
89 """Holds information from one reference in a SwissProt entry.
90
91 Members:
92 number Number of reference in an entry.
93 positions Describes extent of work. list of strings.
94 comments Comments. List of (token, text).
95 references References. List of (dbname, identifier)
96 authors The authors of the work.
97 title Title of the work.
98 location A citation for the work.
99
100 """
102 self.number = None
103 self.positions = []
104 self.comments = []
105 self.references = []
106 self.authors = ''
107 self.title = ''
108 self.location = ''
109
110
117
118
128
129
130
131
132
134 record = None
135 for line in handle:
136 key, value = line[:2], line[5:].rstrip()
137 if key=='**':
138
139
140
141
142
143
144 pass
145 elif key=='ID':
146 record = Record()
147 _read_id(record, line)
148 _sequence_lines = []
149 elif key=='AC':
150 accessions = [word for word in value.rstrip(";").split("; ")]
151 record.accessions.extend(accessions)
152 elif key=='DT':
153 _read_dt(record, line)
154 elif key=='DE':
155 record.description += line[5:]
156 elif key=='GN':
157 record.gene_name += line[5:]
158 elif key=='OS':
159 record.organism += line[5:]
160 elif key=='OG':
161 record.organelle += line[5:]
162 elif key=='OC':
163 cols = [col for col in value.rstrip(";.").split("; ")]
164 record.organism_classification.extend(cols)
165 elif key=='OX':
166 _read_ox(record, line)
167 elif key=='OH':
168 _read_oh(record, line)
169 elif key=='RN':
170 reference = Reference()
171 _read_rn(reference, value)
172 record.references.append(reference)
173 elif key=='RP':
174 assert record.references, "RP: missing RN"
175 record.references[-1].positions.append(value)
176 elif key=='RC':
177 assert record.references, "RC: missing RN"
178 reference = record.references[-1]
179 _read_rc(reference, value)
180 elif key=='RX':
181 assert record.references, "RX: missing RN"
182 reference = record.references[-1]
183 _read_rx(reference, value)
184 elif key=='RL':
185 assert record.references, "RL: missing RN"
186 reference = record.references[-1]
187 reference.location += line[5:]
188
189
190
191 elif key=='RA':
192 assert record.references, "RA: missing RN"
193 reference = record.references[-1]
194 reference.authors += line[5:]
195 elif key=='RG':
196 assert record.references, "RG: missing RN"
197 reference = record.references[-1]
198 reference.authors += line[5:]
199 elif key=="RT":
200 assert record.references, "RT: missing RN"
201 reference = record.references[-1]
202 reference.title += line[5:]
203 elif key=='CC':
204 _read_cc(record, line)
205 elif key=='DR':
206 _read_dr(record, value)
207 elif key=='PE':
208
209 pass
210 elif key=='KW':
211 cols = value.rstrip(";.").split('; ')
212 record.keywords.extend(cols)
213 elif key=='FT':
214 _read_ft(record, line)
215 elif key=='SQ':
216 cols = value.split()
217 assert len(cols) == 7, "I don't understand SQ line %s" % line
218
219 record.seqinfo = int(cols[1]), int(cols[3]), cols[5]
220 elif key==' ':
221 _sequence_lines.append(value.replace(" ", "").rstrip())
222 elif key=='//':
223
224 record.description = record.description.rstrip()
225 record.gene_name = record.gene_name.rstrip()
226 record.organism = record.organism.rstrip()
227 record.organelle = record.organelle.rstrip()
228 for reference in record.references:
229
230 reference.authors = reference.authors.rstrip()
231 reference.title = reference.title.rstrip()
232 reference.location = reference.location.rstrip()
233 record.sequence = "".join(_sequence_lines)
234 return record
235 else:
236 raise ValueError("Unknown keyword %s found" % keyword)
237 if record:
238 raise ValueError("Unexpected end of stream.")
239
240
242 cols = line[5:].split()
243
244
245
246
247
248 if len(cols) == 5 :
249 record.entry_name = cols[0]
250 record.data_class = cols[1].rstrip(";")
251 record.molecule_type = cols[2].rstrip(";")
252 record.sequence_length = int(cols[3])
253 elif len(cols) == 4 :
254 record.entry_name = cols[0]
255 record.data_class = cols[1].rstrip(";")
256 record.molecule_type = None
257 record.sequence_length = int(cols[2])
258 else :
259 raise ValueError("ID line has unrecognised format:\n"+line)
260
261 allowed = ('STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed')
262 if record.data_class not in allowed:
263 raise ValueError("Unrecognized data class %s in line\n%s" % \
264 (record.data_class, line))
265
266
267 if record.molecule_type not in (None, 'PRT'):
268 raise ValueError("Unrecognized molecule type %s in line\n%s" % \
269 (record.molecule_type, line))
270
271
273 value = line[5:]
274 uprline = value.upper()
275 cols = value.rstrip().split()
276 if 'CREATED' in uprline \
277 or 'LAST SEQUENCE UPDATE' in uprline \
278 or 'LAST ANNOTATION UPDATE' in uprline:
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294 uprcols = uprline.split()
295 rel_index = -1
296 for index in range(len(uprcols)):
297 if uprcols[index].find("REL.") >= 0:
298 rel_index = index
299 assert rel_index >= 0, \
300 "Could not find Rel. in DT line: %s" % line
301 version_index = rel_index + 1
302
303 str_version = cols[version_index].rstrip(",")
304
305 if str_version == '':
306 version = 0
307
308 elif str_version.find(".") >= 0:
309 version = str_version
310
311 else:
312 version = int(str_version)
313
314 if 'CREATED' in uprline:
315 record.created = cols[1], version
316 elif 'LAST SEQUENCE UPDATE' in uprline:
317 record.sequence_update = cols[1], version
318 elif 'LAST ANNOTATION UPDATE' in uprline:
319 record.annotation_update = cols[1], version
320 else:
321 assert False, "Shouldn't reach this line!"
322 elif 'INTEGRATED INTO' in uprline \
323 or 'SEQUENCE VERSION' in uprline \
324 or 'ENTRY VERSION' in uprline:
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345 try:
346 version = int(cols[-1])
347 except ValueError :
348 version = 0
349
350
351
352 if "INTEGRATED" in uprline:
353 record.created = cols[1], version
354 elif 'SEQUENCE VERSION' in uprline:
355 record.sequence_update = cols[1], version
356 elif 'ENTRY VERSION' in uprline:
357 record.annotation_update = cols[1], version
358 else:
359 assert False, "Shouldn't reach this line!"
360 else:
361 raise ValueError("I don't understand the date line %s" % line)
362
363
381
382
384
385
386 if record.host_organism:
387 ids = line[5:].rstrip().rstrip(";")
388 else:
389 descr, ids = line[5:].rstrip().rstrip(";").split("=")
390 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr
391 record.host_organism.extend(ids.split(', '))
392
393
395 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn
396 reference.number = int(rn[1:-1])
397
398
414
415
417
418
419
420
421
422
423
424
425 value = value.replace(' [NCBI, ExPASy, Israel, Japan]','')
426
427
428
429
430
431
432
433
434 if "=" in value:
435 cols = value.split("; ")
436 cols = [x.strip() for x in cols]
437 cols = [x for x in cols if x]
438 for col in cols:
439 x = col.split("=")
440 assert len(x) == 2, "I don't understand RX line %s" % line
441 key, value = x[0], x[1].rstrip(";")
442 reference.references.append((key, value))
443
444 else:
445 cols = value.split("; ")
446
447 assert len(cols) == 2, "I don't understand RX line %s" % line
448 reference.references.append((cols[0].rstrip(";"), cols[1].rstrip(".")))
449
450
470
471
479
480
482 line = line[5:]
483 name = line[0:8].rstrip()
484 try:
485 from_res = int(line[9:15])
486 except ValueError:
487 from_res = line[9:15].lstrip()
488 try:
489 to_res = int(line[16:22])
490 except ValueError:
491 to_res = line[16:22].lstrip()
492 description = line[29:70].rstrip()
493
494 if line[29:35]==r"/FTId=":
495 ft_id = line[35:70].rstrip()[:-1]
496 else:
497 ft_id =""
498 if not name:
499 assert not from_res and not to_res
500 name, from_res, to_res, old_description,old_ft_id = record.features[-1]
501 del record.features[-1]
502 description = "%s %s" % (old_description, description)
503
504
505 if name == "VARSPLIC":
506
507
508
509
510
511 descr_cols = description.split(" -> ")
512 if len(descr_cols) == 2:
513 first_seq, second_seq = descr_cols
514 extra_info = ''
515
516
517 extra_info_pos = second_seq.find(" (")
518 if extra_info_pos != -1:
519 extra_info = second_seq[extra_info_pos:]
520 second_seq = second_seq[:extra_info_pos]
521
522 first_seq = first_seq.replace(" ", "")
523 second_seq = second_seq.replace(" ", "")
524
525 description = first_seq + " -> " + second_seq + extra_info
526 record.features.append((name, from_res, to_res, description,ft_id))
527
528
529 if __name__ == "__main__" :
530 print "Quick self test..."
531
532 example_filename = "../../Tests/SwissProt/sp008"
533
534 import os
535 if not os.path.isfile(example_filename):
536 print "Missing test file %s" % example_filename
537 else :
538
539
540 handle = open(example_filename)
541 records = parse(handle)
542 for record in records:
543 print record.entry_name
544 print ",".join(record.accessions)
545 print record.keywords
546 print repr(record.organism)
547 print record.sequence[:20] + "..."
548 handle.close()
549