1
2
3
4
5
6 """
7 This module provides code to work with the sprotXX.dat file from
8 SwissProt (OBSOLETE).
9 http://www.expasy.ch/sprot/sprot-top.html
10
11 Please see Bio.SwissProt for alternatives for the functionality in this module.
12
13 Tested with:
14 Release 37, Release 38, Release 39
15
16 Limited testing with:
17 Release 51, 54
18
19
20 Classes:
21 Record Holds SwissProt data.
22 Reference Holds reference data from a SwissProt entry.
23 Iterator Iterates over entries in a SwissProt file.
24 Dictionary Accesses a SwissProt file using a dictionary interface.
25 RecordParser Parses a SwissProt record into a Record object.
26 SequenceParser Parses a SwissProt record into a SeqRecord object.
27
28 _Scanner Scans SwissProt-formatted data.
29 _RecordConsumer Consumes SwissProt data to a SProt.Record object.
30 _SequenceConsumer Consumes SwissProt data to a SeqRecord object.
31
32
33 Functions:
34 index_file Index a SwissProt file for a Dictionary.
35
36 """
37 from types import *
38 import os
39 from Bio import File
40 from Bio import Index
41 from Bio import Alphabet
42 from Bio import Seq
43 from Bio import SeqRecord
44 from Bio.ParserSupport import *
45
46
47
60
76
77
78 _CHOMP = " \n\r\t.,;"
79
81 """Holds information from a SwissProt record.
82
83 Members:
84 entry_name Name of this entry, e.g. RL1_ECOLI.
85 data_class Either 'STANDARD' or 'PRELIMINARY'.
86 molecule_type Type of molecule, 'PRT',
87 sequence_length Number of residues.
88
89 accessions List of the accession numbers, e.g. ['P00321']
90 created A tuple of (date, release).
91 sequence_update A tuple of (date, release).
92 annotation_update A tuple of (date, release).
93
94 description Free-format description.
95 gene_name Gene name. See userman.txt for description.
96 organism The source of the sequence.
97 organelle The origin of the sequence.
98 organism_classification The taxonomy classification. List of strings.
99 (http://www.ncbi.nlm.nih.gov/Taxonomy/)
100 taxonomy_id A list of NCBI taxonomy id's.
101 host_organism A list of NCBI taxonomy id's of the hosts of a virus,
102 if any.
103 references List of Reference objects.
104 comments List of strings.
105 cross_references List of tuples (db, id1[, id2][, id3]). See the docs.
106 keywords List of the keywords.
107 features List of tuples (key name, from, to, description).
108 from and to can be either integers for the residue
109 numbers, '<', '>', or '?'
110
111 seqinfo tuple of (length, molecular weight, CRC32 value)
112 sequence The sequence.
113
114 """
116 self.entry_name = None
117 self.data_class = None
118 self.molecule_type = None
119 self.sequence_length = None
120
121 self.accessions = []
122 self.created = None
123 self.sequence_update = None
124 self.annotation_update = None
125
126 self.description = ''
127 self.gene_name = ''
128 self.organism = ''
129 self.organelle = ''
130 self.organism_classification = []
131 self.taxonomy_id = []
132 self.host_organism = []
133 self.references = []
134 self.comments = []
135 self.cross_references = []
136 self.keywords = []
137 self.features = []
138
139 self.seqinfo = None
140 self.sequence = ''
141
143 """Holds information from one reference in a SwissProt entry.
144
145 Members:
146 number Number of reference in an entry.
147 positions Describes extent of work. list of strings.
148 comments Comments. List of (token, text).
149 references References. List of (dbname, identifier)
150 authors The authors of the work.
151 title Title of the work.
152 location A citation for the work.
153
154 """
156 self.number = None
157 self.positions = []
158 self.comments = []
159 self.references = []
160 self.authors = ''
161 self.title = ''
162 self.location = ''
163
165 """Returns one record at a time from a SwissProt file.
166
167 Methods:
168 next Return the next record from the stream, or None.
169
170 """
def __init__(self, handle, parser=None):
    """Create a new iterator over the records in a SwissProt file.

    Arguments:
    o handle - A file-like object.  It must provide a readline method,
    which is what the iterator uses to pull records from it.
    o parser - Optional Parser object used to change each record into
    another form.  If None, the raw text of each record is returned.

    Raises ValueError if handle has no readline method.

    Issues a DeprecationWarning, because this class is deprecated.
    """
    import warnings
    # stacklevel=2 attributes the warning to the code creating the
    # Iterator rather than to this line.
    warnings.warn("Bio.SwissProt.SProt.Iterator is deprecated. Please use "
                  "the function Bio.SwissProt.parse instead if you want to "
                  "get a SwissProt.SProt.Record, or Bio.SeqIO.parse if you "
                  "want to get a SeqRecord. If these solutions do not work "
                  "for you, please get in contact with the Biopython "
                  "developers (biopython-dev@biopython.org).",
                  DeprecationWarning, stacklevel=2)

    # Duck-type the handle instead of comparing against FileType and
    # InstanceType: that comparison rejected every file-like object that
    # is an instance of a new-style class.
    if not hasattr(handle, "readline"):
        raise ValueError("I expected a file handle or file-like object")
    self._uhandle = File.UndoHandle(handle)
    self._parser = parser
187
189 """next(self) -> object
190
191 Return the next swissprot record from the file. If no more records,
192 return None.
193
194 """
195 lines = []
196 while 1:
197 line = self._uhandle.readline()
198 if not line:
199 break
200 lines.append(line)
201 if line[:2] == '//':
202 break
203
204 if not lines:
205 return None
206
207 data = ''.join(lines)
208 if self._parser is not None:
209 return self._parser.parse(File.StringHandle(data))
210 return data
211
213 return iter(self.next, None)
214
216 """Accesses a SwissProt file using a dictionary interface.
217
218 """
219 __filename_key = '__filename'
220
def __init__(self, indexname, parser=None):
    """Open a SwissProt Dictionary.

    Arguments:
    o indexname - Name of the index for the dictionary.  The index
    should have been created using the index_file function.
    o parser - Optional Parser object used to change the results into
    another form.  If None, the raw contents of the file are returned.
    """
    self._index = Index.Index(indexname)
    # The name of the data file is looked up in the index itself, under
    # the reserved filename key, and that file is opened for reading.
    self._handle = open(self._index[self.__filename_key])
    self._parser = parser
234
236 return len(self._index)
237
245
247 return getattr(self._index, name)
248
254
255
257 """Parses SwissProt data into a Record object.
258
259 """
263
def parse(self, handle):
    """Parse the SwissProt data on handle and return the result.

    The handle is fed through this parser's scanner, which reports to
    this parser's consumer; whatever the consumer then holds in its
    data attribute is returned.
    """
    consumer = self._consumer
    self._scanner.feed(handle, consumer)
    return consumer.data
267
269 """Parses SwissProt data into a standard SeqRecord object.
270 """
272 """Initialize a SequenceParser.
273
274 Arguments:
275 o alphabet - The alphabet to use for the generated Seq objects. If
276 not supplied this will default to the generic protein alphabet.
277 """
278 self._scanner = _Scanner()
279 self._consumer = _SequenceConsumer(alphabet)
280
def parse(self, handle):
    """Parse the SwissProt data on handle and return the result.

    The handle is fed through this parser's scanner, which reports to
    this parser's consumer; whatever the consumer then holds in its
    data attribute is returned.
    """
    consumer = self._consumer
    self._scanner.feed(handle, consumer)
    return consumer.data
284
286 """Scans SwissProt-formatted data.
287
288 Tested with:
289 Release 37
290 Release 38
291 """
292
def feed(self, handle, consumer):
    """Feed in SwissProt data for scanning.

    Arguments:
    o handle - A file-like object that contains SwissProt data.
    o consumer - A Consumer object that will receive events as the
    record is scanned.
    """
    # A handle that already is an UndoHandle is used as-is rather than
    # being wrapped a second time.
    uhandle = handle
    if not isinstance(uhandle, File.UndoHandle):
        uhandle = File.UndoHandle(handle)
    self._scan_record(uhandle, consumer)
306
308 """Ignores any lines starting **"""
309
310
311
312
313
314
315
316 while "**" == uhandle.peekline()[:2] :
317 skip = uhandle.readline()
318
319
333
334 - def _scan_line(self, line_type, uhandle, event_fn,
335 exactly_one=None, one_or_more=None, any_number=None,
336 up_to_one=None):
354
357
363
369
373
376
380
383
387
391
395
413
417
421
425
429
443
447
453
456
460
463
466
469
472
475
478
479 _scan_fns = [
480 _scan_id,
481 _scan_ac,
482 _scan_dt,
483 _scan_de,
484 _scan_gn,
485 _scan_os,
486 _scan_og,
487 _scan_oc,
488 _scan_ox,
489 _scan_oh,
490 _scan_reference,
491 _scan_cc,
492 _scan_dr,
493 _scan_pe,
494 _scan_kw,
495 _scan_ft,
496 _scan_sq,
497 _scan_sequence_data,
498 _scan_terminator
499 ]
500
502 """Consumer that converts a SwissProt record to a Record object.
503
504 Members:
505 data Record with SwissProt data.
506
507 """
510
512 return "Bio.SwissProt.SProt._RecordConsumer()"
513
515 self.data = Record()
516 self._sequence_lines = []
517
521
523 cols = line.split()
524
525
526
527
528
529
530
531
532 if len(cols) == 6 :
533 self.data.entry_name = cols[1]
534 self.data.data_class = cols[2].rstrip(_CHOMP)
535 self.data.molecule_type = cols[3].rstrip(_CHOMP)
536 self.data.sequence_length = int(cols[4])
537 elif len(cols) == 5 :
538 self.data.entry_name = cols[1]
539 self.data.data_class = cols[2].rstrip(_CHOMP)
540 self.data.molecule_type = None
541 self.data.sequence_length = int(cols[3])
542 else :
543
544 raise ValueError("ID line has unrecognised format:\n"+line)
545
546
547
548
549 if self.data.data_class not in ['STANDARD', 'PRELIMINARY', 'IPI',
550 'Reviewed', 'Unreviewed']:
551 raise ValueError("Unrecognized data class %s in line\n%s" % \
552 (self.data.data_class, line))
553
554
555 if self.data.molecule_type is not None \
556 and self.data.molecule_type != 'PRT':
557 raise ValueError("Unrecognized molecule type %s in line\n%s" % \
558 (self.data.molecule_type, line))
559
566
def date(self, line):
    """Parse a DT (date) line and store the result on self.data.

    Two layouts are understood, told apart by the wording of the line
    (matched case-insensitively):

    o Old style - the line contains 'Created', 'Last sequence update'
    or 'Last annotation update'.  The release is the token that follows
    the last token containing 'Rel.'.  It is stored as an int, as the
    original string if it contains a '.', or as 0 if it is empty.
    o New style - the line contains 'integrated into', 'sequence version'
    or 'entry version'.  The version is the last token of the line with
    any trailing '.' removed; 0 is used if that is not an integer.

    Depending on the wording, self.data.created, self.data.sequence_update
    or self.data.annotation_update is set to a (date, version) tuple,
    where date is the second whitespace-separated token of the line.

    Raises ValueError if the line matches neither layout.
    """
    uprline = line.upper()
    cols = line.rstrip().split()

    if "CREATED" in uprline:
        member, old_style = "created", True
    elif "LAST SEQUENCE UPDATE" in uprline:
        member, old_style = "sequence_update", True
    elif "LAST ANNOTATION UPDATE" in uprline:
        member, old_style = "annotation_update", True
    elif "INTEGRATED INTO" in uprline \
    or "SEQUENCE VERSION" in uprline \
    or "ENTRY VERSION" in uprline:
        old_style = False
        if "INTEGRATED" in uprline:
            member = "created"
        elif "SEQUENCE VERSION" in uprline:
            member = "sequence_update"
        else:
            member = "annotation_update"
    else:
        raise ValueError("I don't understand the date line %s" % line)

    if old_style:
        # Locate the last token mentioning "Rel."; the release number
        # is the token right after it.
        rel_index = -1
        for index, token in enumerate(uprline.split()):
            if "REL." in token:
                rel_index = index
        assert rel_index >= 0, \
               "Could not find Rel. in DT line: %s" % (line)

        str_version = cols[rel_index + 1].rstrip(_CHOMP)
        if str_version == '':
            version = 0
        elif "." in str_version:
            # Not a plain integer (it contains a dot): keep the text.
            version = str_version
        else:
            version = int(str_version)
        date_str = cols[1]
    else:
        # The last token carries the sentence-ending full stop, which
        # must be removed before int() can succeed.  Lines whose last
        # token is not a number still yield version 0.
        try:
            version = int(cols[-1].rstrip("."))
        except ValueError:
            version = 0
        # Drop the comma that follows the date in this layout, so the
        # stored date has the same shape as for old-style lines.
        date_str = cols[1].rstrip(",")

    setattr(self.data, member, (date_str, version))
657
660
663
666
669
675
697
710
712 rn = line[5:].rstrip()
713 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn
714 ref = Reference()
715 ref.number = int(rn[1:-1])
716 self.data.references.append(ref)
717
719 assert self.data.references, "RP: missing RN"
720 self.data.references[-1].positions.append(line[5:].rstrip())
721
749
751 assert self.data.references, "RX: missing RN"
752
753
754
755
756
757
758
759
760 ind = line.find('[NCBI, ExPASy, Israel, Japan]')
761 if ind >= 0:
762 line = line[:ind]
763
764
765
766
767
768
769
770
771 if line.find("=") != -1:
772 cols = line[2:].split("; ")
773 cols = [x.strip() for x in cols]
774 cols = [x for x in cols if x]
775 for col in cols:
776 x = col.split("=")
777 assert len(x) == 2, "I don't understand RX line %s" % line
778 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP)
779 ref = self.data.references[-1].references
780 ref.append((key, value))
781
782 else:
783 cols = line.split()
784
785 assert len(cols) == 3, "I don't understand RX line %s" % line
786 self.data.references[-1].references.append(
787 (cols[1].rstrip(_CHOMP), cols[2].rstrip(_CHOMP)))
788
790 assert self.data.references, "RA: missing RN"
791 ref = self.data.references[-1]
792 ref.authors += line[5:]
793
795 assert self.data.references, "RT: missing RN"
796 ref = self.data.references[-1]
797 ref.title += line[5:]
798
800 assert self.data.references, "RL: missing RN"
801 ref = self.data.references[-1]
802 ref.location += line[5:]
803
822
824
825
826
827
828 line = line[5:]
829
830 i = line.find('[')
831 if i >= 0:
832 line = line[:i]
833 cols = line.rstrip(_CHOMP).split(';')
834 cols = [col.lstrip() for col in cols]
835 self.data.cross_references.append(tuple(cols))
836
840
868
870 """Remove unwanted spaces in sequences.
871
872 During line carryover, the sequences in VARSPLIC can get mangled
873 with unwanted spaces like:
874 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH'
875 We want to check for this case and correct it as it happens.
876 """
877 descr_cols = description.split(" -> ")
878 if len(descr_cols) == 2:
879 first_seq = descr_cols[0]
880 second_seq = descr_cols[1]
881 extra_info = ''
882
883
884 extra_info_pos = second_seq.find(" (")
885 if extra_info_pos != -1:
886 extra_info = second_seq[extra_info_pos:]
887 second_seq = second_seq[:extra_info_pos]
888
889
890 first_seq = first_seq.replace(" ", "")
891 second_seq = second_seq.replace(" ", "")
892
893
894 description = first_seq + " -> " + second_seq + extra_info
895
896 return description
897
901
903 cols = line.split()
904 assert len(cols) == 8, "I don't understand SQ line %s" % line
905
906 self.data.seqinfo = int(cols[2]), int(cols[4]), cols[6]
907
909
910 self._sequence_lines.append(line.replace(" ", "").rstrip())
911
914
915
916
917
918
919
921
922 members = ['description', 'gene_name', 'organism', 'organelle']
923 for m in members:
924 attr = getattr(rec, m)
925 setattr(rec, m, attr.rstrip())
926 for ref in rec.references:
927 self._clean_references(ref)
928
930
931 members = ['authors', 'title', 'location']
932 for m in members:
933 attr = getattr(ref, m)
934 setattr(ref, m, attr.rstrip())
935
937 """Consumer that converts a SwissProt record to a SeqRecord object.
938
939 Members:
940 data Record with SwissProt data.
941 alphabet The alphabet the generated Seq objects will have.
942 """
943
945 """Initialize a Sequence Consumer
946
947 Arguments:
948 o alphabet - The alphabet to use for the generated Seq objects. If
949 not supplied this will default to the generic protein alphabet.
950 """
951 self.data = None
952 self.alphabet = alphabet
953
961
968
972
987
988
992
994
995 self._sequence_lines.append(line.replace(" ", "").rstrip())
996
1003
1012
1013
1032
1033
1034
1035
def date(self, line):
    """Parse a DT (date) line into self.data.annotations.

    The wording of the line (matched case-insensitively) selects the
    annotation key:

    o 'Created' or 'integrated into' -> 'date'
    o 'Last sequence update' or 'sequence version'
    -> 'date_last_sequence_update'
    o 'Last annotation update' or 'entry version'
    -> 'date_last_annotation_update'

    The value stored is the date token of the line: the first token
    after the leading 'DT' line code (or the first token if the line
    code is absent), without a trailing comma.  Lines with none of the
    wordings above are ignored.

    Raises ValueError if a recognised line has no date token.
    """
    uprline = line.upper()
    if "CREATED" in uprline or "INTEGRATED INTO" in uprline:
        key = 'date'
    elif "LAST SEQUENCE UPDATE" in uprline or "SEQUENCE VERSION" in uprline:
        key = 'date_last_sequence_update'
    elif "LAST ANNOTATION UPDATE" in uprline or "ENTRY VERSION" in uprline:
        key = 'date_last_annotation_update'
    else:
        return

    cols = line.split()
    # Skip the line code; taking the very first token would store the
    # text "DT" instead of the date.
    if cols and cols[0] == "DT":
        cols = cols[1:]
    if not cols:
        raise ValueError("I don't understand the date line %s" % line)
    self.data.annotations[key] = cols[0].rstrip(",")
1049
1062
1072
1089
1102
1104
1105
1106
1107
1108
1109 line = line[5:].rstrip(_CHOMP)
1110 index = line.find('=')
1111 if index >= 0:
1112 descr = line[:index]
1113 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr
1114 ids = line[index+1:].split(',')
1115 else:
1116 ids = line.split(',')
1117
1118 try :
1119
1120 self.data.annotations['ncbi_taxid'].extend(ids)
1121 except KeyError:
1122 self.data.annotations['ncbi_taxid'] = ids
1123
1135
1137 """RP line, reference position."""
1138 assert self._current_ref is not None, "RP: missing RN"
1139
1140
1141
1142 pass
1143
1145 """RX line, reference cross-references."""
1146 assert self._current_ref is not None, "RX: missing RN"
1147
1148
1149
1150
1151
1152
1153 if line.find("=") != -1:
1154 cols = line[2:].split("; ")
1155 cols = [x.strip() for x in cols]
1156 cols = [x for x in cols if x]
1157 for col in cols:
1158 x = col.split("=")
1159 assert len(x) == 2, "I don't understand RX line %s" % line
1160 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP)
1161 if key == "MEDLINE" :
1162 self._current_ref.medline_id = value
1163 elif key == "PubMed" :
1164 self._current_ref.pubmed_id = value
1165 else :
1166
1167
1168 pass
1169
1170 else:
1171
1172
1173
1174
1175 ind = line.find('[NCBI, ExPASy, Israel, Japan]')
1176 if ind >= 0:
1177 line = line[:ind]
1178 cols = line.split()
1179
1180 assert len(cols) == 3, "I don't understand RX line %s" % line
1181 key = cols[1].rstrip(_CHOMP)
1182 value = cols[2].rstrip(_CHOMP)
1183 if key == "MEDLINE" :
1184 self._current_ref.medline_id = value
1185 elif key == "PubMed" :
1186 self._current_ref.pubmed_id = value
1187 else :
1188
1189
1190 pass
1191
1193 """RA line, reference author(s)."""
1194 assert self._current_ref is not None, "RA: missing RN"
1195 self._current_ref.authors += line[5:].rstrip("\n")
1196
1198 """RT line, reference title."""
1199 assert self._current_ref is not None, "RT: missing RN"
1200 self._current_ref.title += line[5:].rstrip("\n")
1201
1203 """RL line, reference 'location' - journal, volume, pages, year."""
1204 assert self._current_ref is not None, "RL: missing RN"
1205 self._current_ref.journal += line[5:].rstrip("\n")
1206
1213
def index_file(filename, indexname, rec2key=None):
    """Index a SwissProt file for use with a Dictionary.

    Arguments:
    o filename - Name of the SwissProt file to index.
    o indexname - Name of the index to create.  It is opened with
    truncate=1.
    o rec2key - Optional callback that takes a Record and generates a
    unique key (e.g. the accession number) for the record.  If not
    specified, the entry name will be used.

    For each record the index maps its key to a (start, length) tuple
    taken from the file position before and after the record is read.

    Raises ValueError if filename does not exist, and KeyError if a
    record yields an empty key or a key that is already in the index.
    """
    from Bio.SwissProt import parse
    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    # Close the data file on every exit path, including the KeyError
    # raised for an empty or duplicate key.
    try:
        # Plain ints are enough for the offsets; no explicit long
        # literal or long() conversion is needed.
        # NOTE(review): the offsets assume handle.tell() sits exactly
        # at the end of the record just returned by parse - confirm
        # that parse does not read ahead.
        end = 0
        for record in parse(handle):
            start = end
            end = int(handle.tell())
            length = end - start

            if rec2key is not None:
                key = rec2key(record)
            else:
                key = record.entry_name

            if not key:
                raise KeyError("empty sequence key was produced")
            elif key in index:
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, length
    finally:
        handle.close()
1250