1
2
3
4
5
6
7 """Module for working with Prosite files from ExPASy (OBSOLETE).
8
9 Most of the functionality in this module has moved to Bio.ExPASy.Prosite;
10 please see
11
12 Bio.ExPASy.Prosite.read To read a Prosite file containing one entry.
13 Bio.ExPASy.Prosite.parse Iterates over entries in a Prosite file.
14 Bio.ExPASy.Prosite.Record Holds Prosite data.
15
16 For
17 scan_sequence_expasy Scan a sequence for occurrences of Prosite patterns.
18 _extract_pattern_hits Extract Prosite patterns from a web page.
19 PatternHit Holds data from a hit against a Prosite pattern.
20 please see the new module Bio.ExPASy.ScanProsite.
21
22 The other functions and classes in Bio.Prosite (including
23 Bio.Prosite.index_file and Bio.Prosite.Dictionary) are considered deprecated,
24 and were not moved to Bio.ExPASy.Prosite. If you use this functionality,
25 please contact the Biopython developers at biopython-dev@biopython.org to
26 avoid permanent removal of this module from Biopython.
27
28
29 This module provides code to work with the prosite dat file from
30 Prosite.
31 http://www.expasy.ch/prosite/
32
33 Tested with:
34 Release 15.0, July 1998
35 Release 16.0, July 1999
36 Release 17.0, Dec 2001
37 Release 19.0, Mar 2006
38
39
40 Functions:
41 parse Iterates over entries in a Prosite file.
42 scan_sequence_expasy Scan a sequence for occurrences of Prosite patterns.
43 index_file Index a Prosite file for a Dictionary.
44 _extract_record Extract Prosite data from a web page.
45 _extract_pattern_hits Extract Prosite patterns from a web page.
46
47
48 Classes:
49 Record Holds Prosite data.
50 PatternHit Holds data from a hit against a Prosite pattern.
51 Dictionary Accesses a Prosite file using a dictionary interface.
52 RecordParser Parses a Prosite record into a Record object.
53 Iterator Iterates over entries in a Prosite file; DEPRECATED.
54
55 _Scanner Scans Prosite-formatted data.
56 _RecordConsumer Consumes Prosite data to a Record object.
57
58 """
59 from types import *
60 import re
61 import sgmllib
62 from Bio import File
63 from Bio import Index
64 from Bio.ParserSupport import *
65
66
67
68
69
83
98
100 """Holds information from a Prosite record.
101
102 Members:
103 name ID of the record. e.g. ADH_ZINC
104 type Type of entry. e.g. PATTERN, MATRIX, or RULE
105 accession e.g. PS00387
106 created Date the entry was created. (MMM-YYYY)
107 data_update Date the 'primary' data was last updated.
108 info_update Date data other than 'primary' data was last updated.
109 pdoc ID of the PROSITE DOCumentation.
110
111 description Free-format description.
112 pattern The PROSITE pattern. See docs.
113 matrix List of strings that describes a matrix entry.
114 rules List of rule definitions (from RU lines). (strings)
115 prorules List of prorules (from PR lines). (strings)
116
117 NUMERICAL RESULTS
118 nr_sp_release SwissProt release.
119 nr_sp_seqs Number of seqs in that release of Swiss-Prot. (int)
120 nr_total Number of hits in Swiss-Prot. tuple of (hits, seqs)
121 nr_positive True positives. tuple of (hits, seqs)
122 nr_unknown Could be positives. tuple of (hits, seqs)
123 nr_false_pos False positives. tuple of (hits, seqs)
124 nr_false_neg False negatives. (int)
125 nr_partial False negatives, because they are fragments. (int)
126
127 COMMENTS
128 cc_taxo_range Taxonomic range. See docs for format
129 cc_max_repeat Maximum number of repetitions in a protein
130 cc_site Interesting site. list of tuples (pattern pos, desc.)
131 cc_skip_flag Can this entry be ignored?
132 cc_matrix_type
133 cc_scaling_db
134 cc_author
135 cc_ft_key
136 cc_ft_desc
137 cc_version version number (introduced in release 19.0)
138
139 DATA BANK REFERENCES - The following are all
140 lists of tuples (swiss-prot accession,
141 swiss-prot name)
142 dr_positive
143 dr_false_neg
144 dr_false_pos
145 dr_potential Potential hits, but fingerprint region not yet available.
146 dr_unknown Could possibly belong
147
148 pdb_structs List of PDB entries.
149
150 """
152 self.name = ''
153 self.type = ''
154 self.accession = ''
155 self.created = ''
156 self.data_update = ''
157 self.info_update = ''
158 self.pdoc = ''
159
160 self.description = ''
161 self.pattern = ''
162 self.matrix = []
163 self.rules = []
164 self.prorules = []
165 self.postprocessing = []
166
167 self.nr_sp_release = ''
168 self.nr_sp_seqs = ''
169 self.nr_total = (None, None)
170 self.nr_positive = (None, None)
171 self.nr_unknown = (None, None)
172 self.nr_false_pos = (None, None)
173 self.nr_false_neg = None
174 self.nr_partial = None
175
176 self.cc_taxo_range = ''
177 self.cc_max_repeat = ''
178 self.cc_site = []
179 self.cc_skip_flag = ''
180
181 self.dr_positive = []
182 self.dr_false_neg = []
183 self.dr_false_pos = []
184 self.dr_potential = []
185 self.dr_unknown = []
186
187 self.pdb_structs = []
188
190 """Holds information from a hit against a Prosite pattern.
191
192 Members:
193 name ID of the record. e.g. ADH_ZINC
194 accession e.g. PS00387
195 pdoc ID of the PROSITE DOCumentation.
196 description Free-format description.
197 matches List of tuples (start, end, sequence) where
198 start and end are indexes of the match, and sequence is
199 the sequence matched.
200
201 """
209 lines = []
210 lines.append("%s %s %s" % (self.accession, self.pdoc, self.name))
211 lines.append(self.description)
212 lines.append('')
213 if len(self.matches) > 1:
214 lines.append("Number of matches: %s" % len(self.matches))
215 for i in range(len(self.matches)):
216 start, end, seq = self.matches[i]
217 range_str = "%d-%d" % (start, end)
218 if len(self.matches) > 1:
219 lines.append("%7d %10s %s" % (i+1, range_str, seq))
220 else:
221 lines.append("%7s %10s %s" % (' ', range_str, seq))
222 return "\n".join(lines)
223
225 """Returns one record at a time from a Prosite file.
226
227 Methods:
228 next Return the next record from the stream, or None.
229
230 """
def __init__(self, handle, parser=None):
    """__init__(self, handle, parser=None)

    Create a new iterator.  handle is a file-like object.  parser
    is an optional Parser object to change the results into another form.
    If set to None, then the raw contents of the file will be returned.

    A DeprecationWarning is issued every time an Iterator is created.
    Raises ValueError if handle is not a file, not an old-style
    instance, and does not provide a readline method.

    """
    import warnings
    warnings.warn("Bio.Prosite.Iterator is deprecated; we recommend using "
                  "the function Bio.Prosite.parse instead. Please contact "
                  "the Biopython developers at biopython-dev@biopython.org "
                  "if you cannot use Bio.Prosite.parse instead of "
                  "Bio.Prosite.Iterator.",
                  DeprecationWarning)
    # The exact-type test alone rejects legitimate file-like objects that
    # are neither built-in files nor old-style instances (for example
    # cStringIO objects or instances of new-style classes), so anything
    # offering readline is accepted as well.
    # NOTE(review): assumes File.UndoHandle only needs readline-style
    # access to the wrapped handle -- confirm against Bio.File.
    is_known_type = type(handle) is FileType or type(handle) is InstanceType
    if not (is_known_type or hasattr(handle, "readline")):
        raise ValueError("I expected a file handle or file-like object")
    self._uhandle = File.UndoHandle(handle)
    self._parser = parser
246
248 """next(self) -> object
249
250 Return the next Prosite record from the file. If no more records,
251 return None.
252
253 """
254
255 line = self._uhandle.peekline()
256 if line[:2] == 'CC':
257 while 1:
258 line = self._uhandle.readline()
259 if not line:
260 break
261 if line[:2] == '//':
262 break
263 if line[:2] != 'CC':
264 raise ValueError("Oops, where's the copyright?")
265
266 lines = []
267 while 1:
268 line = self._uhandle.readline()
269 if not line:
270 break
271 lines.append(line)
272 if line[:2] == '//':
273 break
274
275 if not lines:
276 return None
277
278 data = "".join(lines)
279 if self._parser is not None:
280 return self._parser.parse(File.StringHandle(data))
281 return data
282
284 return iter(self.next, None)
285
287 """Accesses a Prosite file using a dictionary interface.
288
289 """
290 __filename_key = '__filename'
291
def __init__(self, indexname, parser=None):
    """__init__(self, indexname, parser=None)

    Open a Prosite Dictionary.  indexname names an index that is
    opened with Index.Index; the data file whose name is stored in
    that index under the reserved filename key is then opened for
    reading.  parser is an optional Parser object and is stored
    unchanged on the instance.

    """
    index = Index.Index(indexname)
    self._index = index
    # The name of the indexed data file is kept inside the index itself.
    data_filename = index[Dictionary.__filename_key]
    self._handle = open(data_filename)
    self._parser = parser
305
307 return len(self._index)
308
316
318 return getattr(self._index, name)
319
321 """Parses Prosite data into a Record object.
322
323 """
327
def parse(self, handle):
    """parse(self, handle) -> object

    Feed handle through the scanner into the consumer and return the
    consumer's data attribute.

    """
    scanner, consumer = self._scanner, self._consumer
    scanner.feed(handle, consumer)
    return consumer.data
331
333 """Scans Prosite-formatted data.
334
335 Tested with:
336 Release 15.0, July 1998
337
338 """
def feed(self, handle, consumer):
    """feed(self, handle, consumer)

    Scan Prosite data.  handle is a file-like object holding the data
    (it is wrapped in a File.UndoHandle unless it already is one).
    consumer is a Consumer object that receives the events produced
    while scanning.  Scanning stops at end of input or as soon as
    consumer.finished becomes true.  Raises ValueError when a
    non-blank line starts with neither 'ID' nor 'CC'.

    """
    if isinstance(handle, File.UndoHandle):
        source = handle
    else:
        source = File.UndoHandle(handle)

    consumer.finished = False
    while not consumer.finished:
        peeked = source.peekline()
        if not peeked:
            # Nothing left to read.
            break
        if is_blank_line(peeked):
            # Blank lines are consumed and otherwise ignored.
            source.readline()
            continue
        tag = peeked[:2]
        if tag == 'ID':
            self._scan_record(source, consumer)
        elif tag == 'CC':
            self._scan_copyrights(source, consumer)
        else:
            raise ValueError("There doesn't appear to be a record")
367
369 consumer.start_copyrights()
370 self._scan_line('CC', uhandle, consumer.copyright, any_number=1)
371 self._scan_terminator(uhandle, consumer)
372 consumer.end_copyrights()
373
386
387 - def _scan_line(self, line_type, uhandle, event_fn,
388 exactly_one=None, one_or_more=None, any_number=None,
389 up_to_one=None):
407
410
413
416
419
422
425
426
427
428
429
430
431
432
433
434
435
436
437
438
442
445
449
452
456
460
464
467
470
471
472
473
# Ordered list of the scanner methods, one per kind of line, ending with
# the record terminator.
# NOTE(review): _scan_ma, _scan_nr and _scan_cc appear twice in this
# list; presumably this lets records whose MA/NR/CC lines come in a
# different order still be scanned -- confirm against the code that
# iterates over _scan_fns.
_scan_fns = [
    _scan_id,
    _scan_ac,
    _scan_dt,
    _scan_de,
    _scan_pa,
    _scan_ma,
    _scan_pp,
    _scan_ru,
    _scan_nr,
    _scan_cc,

    # Second occurrence of the MA / NR / CC scanners (see note above).
    _scan_ma,
    _scan_nr,
    _scan_cc,

    _scan_dr,
    _scan_3d,
    _scan_pr,
    _scan_do,
    _scan_terminator
    ]
500
502 """Consumer that converts a Prosite record to a Record object.
503
504 Members:
505 data Record with Prosite data.
506
507 """
510
513
516
518 cols = line.split()
519 if len(cols) != 3:
520 raise ValueError("I don't understand identification line\n%s" \
521 % line)
522 self.data.name = self._chomp(cols[1])
523 self.data.type = self._chomp(cols[2])
524
526 cols = line.split()
527 if len(cols) != 2:
528 raise ValueError("I don't understand accession line\n%s" % line)
529 self.data.accession = self._chomp(cols[1])
530
def date(self, line):
    """date(self, line)

    Parse a date line and store its three dates on self.data as
    created, data_update and info_update.  The line is upper-cased
    before it is split, so the stored dates are upper case.

    Raises ValueError if the line does not have the expected layout,
    including when it has too few whitespace-separated fields.

    """
    uprline = line.upper()
    cols = uprline.split()

    # Nine fields are indexed below (cols[0] .. cols[8]); check the count
    # first so that a truncated line is reported as a ValueError like any
    # other malformed date line instead of escaping as an IndexError.
    # Only the first four characters of the last marker are compared, so
    # both '(INFO' and '(INF' spellings are accepted.
    if len(cols) < 9 or \
       cols[2] != '(CREATED);' or \
       cols[4] != '(DATA' or cols[5] != 'UPDATE);' or \
       cols[7][:4] != '(INF' or cols[8] != 'UPDATE).':
        raise ValueError("I don't understand date line\n%s" % line)

    self.data.created = cols[1]
    self.data.data_update = cols[3]
    self.data.info_update = cols[6]
544
547
550
553
554 - def postprocessing(self, line):
557
558 - def rule(self, line):
560
562 cols = self._clean(line).split(";")
563 for col in cols:
564 if not col:
565 continue
566 qual, data = [word.lstrip() for word in col.split("=")]
567 if qual == '/RELEASE':
568 release, seqs = data.split(",")
569 self.data.nr_sp_release = release
570 self.data.nr_sp_seqs = int(seqs)
571 elif qual == '/FALSE_NEG':
572 self.data.nr_false_neg = int(data)
573 elif qual == '/PARTIAL':
574 self.data.nr_partial = int(data)
575 elif qual in ['/TOTAL', '/POSITIVE', '/UNKNOWN', '/FALSE_POS']:
576 m = re.match(r'(\d+)\((\d+)\)', data)
577 if not m:
578 raise Exception("Broken data %s in comment line\n%s" \
579 % (repr(data), line))
580 hits = tuple(map(int, m.groups()))
581 if(qual == "/TOTAL"):
582 self.data.nr_total = hits
583 elif(qual == "/POSITIVE"):
584 self.data.nr_positive = hits
585 elif(qual == "/UNKNOWN"):
586 self.data.nr_unknown = hits
587 elif(qual == "/FALSE_POS"):
588 self.data.nr_false_pos = hits
589 else:
590 raise ValueError("Unknown qual %s in comment line\n%s" \
591 % (repr(qual), line))
592
634
653
658
663
666
669
670 - def _chomp(self, word, to_chomp='.,;'):
671
672 if word[-1] in to_chomp:
673 return word[:-1]
674 return word
675
676 - def _clean(self, line, rstrip=1):
677
678 if rstrip:
679 return line[5:].rstrip()
680 return line[5:]
681
683 """scan_sequence_expasy(seq=None, id=None, exclude_frequent=None) ->
684 list of PatternHit's
685
686 Search a sequence for occurrences of Prosite patterns. You can
687 specify either a sequence in seq or a SwissProt/trEMBL ID or accession
688 in id. Only one of those should be given. If exclude_frequent
689 is true, then the patterns with the high probability of occurring
690 will be excluded.
691
692 """
693 from Bio import ExPASy
694 if (seq and id) or not (seq or id):
695 raise ValueError("Please specify either a sequence or an id")
696 handle = ExPASy.scanprosite1(seq, id, exclude_frequent)
697 return _extract_pattern_hits(handle)
698
700 """_extract_pattern_hits(handle) -> list of PatternHit's
701
702 Extract hits from a web page. Raises a ValueError if there
703 was an error in the query.
704
705 """
706 class parser(sgmllib.SGMLParser):
707 def __init__(self):
708 sgmllib.SGMLParser.__init__(self)
709 self.hits = []
710 self.broken_message = 'Some error occurred'
711 self._in_pre = 0
712 self._current_hit = None
713 self._last_found = None
714 def handle_data(self, data):
715 if data.find('try again') >= 0:
716 self.broken_message = data
717 return
718 elif data == 'illegal':
719 self.broken_message = 'Sequence contains illegal characters'
720 return
721 if not self._in_pre:
722 return
723 elif not data.strip():
724 return
725 if self._last_found is None and data[:4] == 'PDOC':
726 self._current_hit.pdoc = data
727 self._last_found = 'pdoc'
728 elif self._last_found == 'pdoc':
729 if data[:2] != 'PS':
730 raise ValueError("Expected accession but got:\n%s" % data)
731 self._current_hit.accession = data
732 self._last_found = 'accession'
733 elif self._last_found == 'accession':
734 self._current_hit.name = data
735 self._last_found = 'name'
736 elif self._last_found == 'name':
737 self._current_hit.description = data
738 self._last_found = 'description'
739 elif self._last_found == 'description':
740 m = re.findall(r'(\d+)-(\d+) (\w+)', data)
741 for start, end, seq in m:
742 self._current_hit.matches.append(
743 (int(start), int(end), seq))
744
745 def do_hr(self, attrs):
746
747 if self._in_pre:
748 self._current_hit = PatternHit()
749 self.hits.append(self._current_hit)
750 self._last_found = None
751 def start_pre(self, attrs):
752 self._in_pre = 1
753 self.broken_message = None
754 def end_pre(self):
755 self._in_pre = 0
756 p = parser()
757 p.feed(handle.read())
758 if p.broken_message:
759 raise ValueError(p.broken_message)
760 return p.hits
761
762
763
764
765 -def index_file(filename, indexname, rec2key=None):
766 """index_file(filename, indexname, rec2key=None)
767
768 Index a Prosite file. filename is the name of the file.
769 indexname is the name of the dictionary. rec2key is an
770 optional callback that takes a Record and generates a unique key
771 (e.g. the accession number) for the record. If not specified,
772 the id name will be used.
773
774 """
775 import os
776 if not os.path.exists(filename):
777 raise ValueError("%s does not exist" % filename)
778
779 index = Index.Index(indexname, truncate=1)
780 index[Dictionary._Dictionary__filename_key] = filename
781
782 handle = open(filename)
783 records = parse(handle)
784 end = 0L
785 for record in records:
786 start = end
787 end = long(handle.tell())
788 length = end - start
789
790 if rec2key is not None:
791 key = rec2key(record)
792 else:
793 key = record.name
794
795 if not key:
796 raise KeyError("empty key was produced")
797 elif key in index:
798 raise KeyError("duplicate key %s found" % key)
799
800 index[key] = start, length
801