1
2
3
4
5
6
7 """Module for working with Prosite files from ExPASy (OBSOLETE).
8
9 Most of the functionality in this module has moved to Bio.ExPASy.Prosite;
10 please see
11
12 Bio.ExPASy.Prosite.read To read a Prosite file containing one entry.
13 Bio.ExPASy.Prosite.parse Iterates over entries in a Prosite file.
14 Bio.ExPASy.Prosite.Record Holds Prosite data.
15
16 For
17 scan_sequence_expasy Scan a sequence for occurrences of Prosite patterns.
18 _extract_pattern_hits Extract Prosite patterns from a web page.
19 PatternHit Holds data from a hit against a Prosite pattern.
20 please see the new module Bio.ExPASy.ScanProsite.
21
22 The other functions and classes in Bio.Prosite (including
23 Bio.Prosite.index_file and Bio.Prosite.Dictionary) are considered deprecated,
24 and were not moved to Bio.ExPASy.Prosite. If you use this functionality,
25 please contact the Biopython developers at biopython-dev@biopython.org to
26 avoid permanent removal of this module from Biopython.
27
28
29 This module provides code to work with the prosite dat file from
30 Prosite.
31 http://www.expasy.ch/prosite/
32
33 Tested with:
34 Release 15.0, July 1998
35 Release 16.0, July 1999
36 Release 17.0, Dec 2001
37 Release 19.0, Mar 2006
38
39
40 Functions:
41 parse Iterates over entries in a Prosite file.
42 scan_sequence_expasy Scan a sequence for occurrences of Prosite patterns.
43 index_file Index a Prosite file for a Dictionary.
44 _extract_record Extract Prosite data from a web page.
45 _extract_pattern_hits Extract Prosite patterns from a web page.
46
47
48 Classes:
49 Record Holds Prosite data.
50 PatternHit Holds data from a hit against a Prosite pattern.
51 Dictionary Accesses a Prosite file using a dictionary interface.
52 RecordParser Parses a Prosite record into a Record object.
53
54 _Scanner Scans Prosite-formatted data.
55 _RecordConsumer Consumes Prosite data to a Record object.
56
57 """
58 from types import *
59 import re
60 import sgmllib
61 from Bio import File
62 from Bio import Index
63 from Bio.ParserSupport import *
64
65
66
67
68
82
97
99 """Holds information from a Prosite record.
100
101 Members:
102 name ID of the record. e.g. ADH_ZINC
103 type Type of entry. e.g. PATTERN, MATRIX, or RULE
104 accession e.g. PS00387
105 created Date the entry was created. (MMM-YYYY)
106 data_update Date the 'primary' data was last updated.
107 info_update Date data other than 'primary' data was last updated.
108 pdoc ID of the PROSITE DOCumentation.
109
110 description Free-format description.
111 pattern The PROSITE pattern. See docs.
112 matrix List of strings that describes a matrix entry.
113 rules List of rule definitions (from RU lines). (strings)
114 prorules List of prorules (from PR lines). (strings)
115
116 NUMERICAL RESULTS
117 nr_sp_release SwissProt release.
118 nr_sp_seqs Number of seqs in that release of Swiss-Prot. (int)
119 nr_total Number of hits in Swiss-Prot. tuple of (hits, seqs)
120 nr_positive True positives. tuple of (hits, seqs)
121 nr_unknown Could be positives. tuple of (hits, seqs)
122 nr_false_pos False positives. tuple of (hits, seqs)
123 nr_false_neg False negatives. (int)
124 nr_partial False negatives, because they are fragments. (int)
125
126 COMMENTS
127 cc_taxo_range Taxonomic range. See docs for format
128 cc_max_repeat Maximum number of repetitions in a protein
129 cc_site Interesting site. list of tuples (pattern pos, desc.)
130 cc_skip_flag Can this entry be ignored?
131 cc_matrix_type
132 cc_scaling_db
133 cc_author
134 cc_ft_key
135 cc_ft_desc
136 cc_version version number (introduced in release 19.0)
137
138 DATA BANK REFERENCES - The following are all
139 lists of tuples (swiss-prot accession,
140 swiss-prot name)
141 dr_positive
142 dr_false_neg
143 dr_false_pos
144 dr_potential Potential hits, but fingerprint region not yet available.
145 dr_unknown Could possibly belong
146
147 pdb_structs List of PDB entries.
148
149 """
151 self.name = ''
152 self.type = ''
153 self.accession = ''
154 self.created = ''
155 self.data_update = ''
156 self.info_update = ''
157 self.pdoc = ''
158
159 self.description = ''
160 self.pattern = ''
161 self.matrix = []
162 self.rules = []
163 self.prorules = []
164 self.postprocessing = []
165
166 self.nr_sp_release = ''
167 self.nr_sp_seqs = ''
168 self.nr_total = (None, None)
169 self.nr_positive = (None, None)
170 self.nr_unknown = (None, None)
171 self.nr_false_pos = (None, None)
172 self.nr_false_neg = None
173 self.nr_partial = None
174
175 self.cc_taxo_range = ''
176 self.cc_max_repeat = ''
177 self.cc_site = []
178 self.cc_skip_flag = ''
179
180 self.dr_positive = []
181 self.dr_false_neg = []
182 self.dr_false_pos = []
183 self.dr_potential = []
184 self.dr_unknown = []
185
186 self.pdb_structs = []
187
189 """Holds information from a hit against a Prosite pattern.
190
191 Members:
192 name ID of the record. e.g. ADH_ZINC
193 accession e.g. PS00387
194 pdoc ID of the PROSITE DOCumentation.
195 description Free-format description.
196 matches List of tuples (start, end, sequence) where
197 start and end are indexes of the match, and sequence is
198 the sequence matched.
199
200 """
208 lines = []
209 lines.append("%s %s %s" % (self.accession, self.pdoc, self.name))
210 lines.append(self.description)
211 lines.append('')
212 if len(self.matches) > 1:
213 lines.append("Number of matches: %s" % len(self.matches))
214 for i in range(len(self.matches)):
215 start, end, seq = self.matches[i]
216 range_str = "%d-%d" % (start, end)
217 if len(self.matches) > 1:
218 lines.append("%7d %10s %s" % (i+1, range_str, seq))
219 else:
220 lines.append("%7s %10s %s" % (' ', range_str, seq))
221 return "\n".join(lines)
222
223
225 """Accesses a Prosite file using a dictionary interface.
226
227 """
228 __filename_key = '__filename'
229
def __init__(self, indexname, parser=None):
    """__init__(self, indexname, parser=None)

    Open a Prosite Dictionary backed by an existing index.

    indexname is the name of the index for the dictionary; it should
    have been created with the index_file function.  parser is an
    optional Parser object used to change the results into another
    form.  If it is None, the raw contents of the file will be
    returned.

    """
    index = Index.Index(indexname)
    self._index = index
    # The index stores the name of the indexed Prosite file under a
    # reserved key; open that file for later lookups.
    data_filename = index[Dictionary.__filename_key]
    self._handle = open(data_filename)
    self._parser = parser
243
246
254
257
259 """Parses Prosite data into a Record object.
260
261 """
265
def parse(self, handle):
    """Scan handle with the parser's scanner and return the consumer's data.

    handle is passed straight to self._scanner.feed together with
    self._consumer; whatever the consumer holds in its ``data``
    attribute afterwards is returned.

    """
    consumer = self._consumer
    self._scanner.feed(handle, consumer)
    return consumer.data
269
271 """Scans Prosite-formatted data.
272
273 Tested with:
274 Release 15.0, July 1998
275
276 """
def feed(self, handle, consumer):
    """feed(self, handle, consumer)

    Scan Prosite data and report what is found to a consumer.

    handle is a file-like object containing Prosite data; it is
    wrapped in a File.UndoHandle unless it already is one.  consumer
    is a Consumer object that receives events as the data is scanned.

    Scanning continues until the data runs out or the consumer's
    ``finished`` flag becomes true.  Raises ValueError when a
    non-blank line starts neither a record ('ID') nor a copyright
    section ('CC').

    """
    uhandle = handle
    if not isinstance(uhandle, File.UndoHandle):
        uhandle = File.UndoHandle(uhandle)

    consumer.finished = False
    while not consumer.finished:
        # Peek rather than read, so the sub-scanners see the line too.
        line = uhandle.peekline()
        if not line:
            break
        if is_blank_line(line):
            # Blank lines between sections are consumed and ignored.
            uhandle.readline()
            continue

        line_type = line[:2]
        if line_type == 'ID':
            self._scan_record(uhandle, consumer)
        elif line_type == 'CC':
            self._scan_copyrights(uhandle, consumer)
        else:
            raise ValueError("There doesn't appear to be a record")
305
307 consumer.start_copyrights()
308 self._scan_line('CC', uhandle, consumer.copyright, any_number=1)
309 self._scan_terminator(uhandle, consumer)
310 consumer.end_copyrights()
311
324
325 - def _scan_line(self, line_type, uhandle, event_fn,
326 exactly_one=None, one_or_more=None, any_number=None,
327 up_to_one=None):
345
348
351
354
357
360
363
364
365
366
367
368
369
370
371
372
373
374
375
376
380
383
387
390
394
398
402
405
408
409
410
411
# Table of the per-line-type scan methods, in the order listed here
# (ID, AC, DT, DE, PA, MA, PP, RU, NR, CC, ..., DR, 3D, PR, DO and the
# record terminator).
# NOTE(review): the code that consumes this table is not visible here;
# presumably each function is applied in turn to a record -- confirm.
_scan_fns = [
_scan_id,
_scan_ac,
_scan_dt,
_scan_de,
_scan_pa,
_scan_ma,
_scan_pp,
_scan_ru,
_scan_nr,
_scan_cc,

# NOTE(review): _scan_ma, _scan_nr and _scan_cc are listed a second
# time below.  Presumably this tolerates records whose MA/NR/CC lines
# appear in a different order -- confirm before removing the repeats.
_scan_ma,
_scan_nr,
_scan_cc,

_scan_dr,
_scan_3d,
_scan_pr,
_scan_do,
_scan_terminator
]
438
440 """Consumer that converts a Prosite record to a Record object.
441
442 Members:
443 data Record with Prosite data.
444
445 """
448
451
454
456 cols = line.split()
457 if len(cols) != 3:
458 raise ValueError("I don't understand identification line\n%s" \
459 % line)
460 self.data.name = self._chomp(cols[1])
461 self.data.type = self._chomp(cols[2])
462
464 cols = line.split()
465 if len(cols) != 2:
466 raise ValueError("I don't understand accession line\n%s" % line)
467 self.data.accession = self._chomp(cols[1])
468
def date(self, line):
    """Parse a date line into the record's three date fields.

    The line is upper-cased and split on whitespace.  It must then
    consist of a leading line code followed by three dates, each
    followed by its label, in this order:

        <date> (CREATED); <date> (DATA UPDATE); <date> (INF... UPDATE).

    The third label only has to start with '(INF', so more than one
    spelling of that word is accepted.

    On success self.data.created, self.data.data_update and
    self.data.info_update are set to the (upper-cased) date tokens.

    Raises ValueError if the line does not have this layout, including
    when it has too few fields.

    """
    cols = line.upper().split()

    # Fields up to index 8 are inspected below; check the length first
    # so a truncated line is reported as a ValueError like any other
    # malformed line, rather than escaping as an IndexError.
    if len(cols) < 9 or \
       cols[2] != '(CREATED);' or \
       cols[4] != '(DATA' or cols[5] != 'UPDATE);' or \
       cols[7][:4] != '(INF' or cols[8] != 'UPDATE).':
        raise ValueError("I don't understand date line\n%s" % line)

    self.data.created = cols[1]
    self.data.data_update = cols[3]
    self.data.info_update = cols[6]
482
485
488
491
492 - def postprocessing(self, line):
495
496 - def rule(self, line):
498
500 cols = self._clean(line).split(";")
501 for col in cols:
502 if not col:
503 continue
504 qual, data = [word.lstrip() for word in col.split("=")]
505 if qual == '/RELEASE':
506 release, seqs = data.split(",")
507 self.data.nr_sp_release = release
508 self.data.nr_sp_seqs = int(seqs)
509 elif qual == '/FALSE_NEG':
510 self.data.nr_false_neg = int(data)
511 elif qual == '/PARTIAL':
512 self.data.nr_partial = int(data)
513 elif qual in ['/TOTAL', '/POSITIVE', '/UNKNOWN', '/FALSE_POS']:
514 m = re.match(r'(\d+)\((\d+)\)', data)
515 if not m:
516 raise Exception("Broken data %s in comment line\n%s" \
517 % (repr(data), line))
518 hits = tuple(map(int, m.groups()))
519 if(qual == "/TOTAL"):
520 self.data.nr_total = hits
521 elif(qual == "/POSITIVE"):
522 self.data.nr_positive = hits
523 elif(qual == "/UNKNOWN"):
524 self.data.nr_unknown = hits
525 elif(qual == "/FALSE_POS"):
526 self.data.nr_false_pos = hits
527 else:
528 raise ValueError("Unknown qual %s in comment line\n%s" \
529 % (repr(qual), line))
530
572
591
596
601
604
607
608 - def _chomp(self, word, to_chomp='.,;'):
609
610 if word[-1] in to_chomp:
611 return word[:-1]
612 return word
613
614 - def _clean(self, line, rstrip=1):
615
616 if rstrip:
617 return line[5:].rstrip()
618 return line[5:]
619
621 """scan_sequence_expasy(seq=None, id=None, exclude_frequent=None) ->
622 list of PatternHit's
623
624 Search a sequence for occurrences of Prosite patterns. You can
625 specify either a sequence in seq or a SwissProt/trEMBL ID or accession
626 in id. Only one of those should be given. If exclude_frequent
627 is true, then the patterns with the high probability of occurring
628 will be excluded.
629
630 """
631 from Bio import ExPASy
632 if (seq and id) or not (seq or id):
633 raise ValueError("Please specify either a sequence or an id")
634 handle = ExPASy.scanprosite1(seq, id, exclude_frequent)
635 return _extract_pattern_hits(handle)
636
638 """_extract_pattern_hits(handle) -> list of PatternHit's
639
640 Extract hits from a web page. Raises a ValueError if there
641 was an error in the query.
642
643 """
644 class parser(sgmllib.SGMLParser):
645 def __init__(self):
646 sgmllib.SGMLParser.__init__(self)
647 self.hits = []
648 self.broken_message = 'Some error occurred'
649 self._in_pre = 0
650 self._current_hit = None
651 self._last_found = None
652 def handle_data(self, data):
653 if data.find('try again') >= 0:
654 self.broken_message = data
655 return
656 elif data == 'illegal':
657 self.broken_message = 'Sequence contains illegal characters'
658 return
659 if not self._in_pre:
660 return
661 elif not data.strip():
662 return
663 if self._last_found is None and data[:4] == 'PDOC':
664 self._current_hit.pdoc = data
665 self._last_found = 'pdoc'
666 elif self._last_found == 'pdoc':
667 if data[:2] != 'PS':
668 raise ValueError("Expected accession but got:\n%s" % data)
669 self._current_hit.accession = data
670 self._last_found = 'accession'
671 elif self._last_found == 'accession':
672 self._current_hit.name = data
673 self._last_found = 'name'
674 elif self._last_found == 'name':
675 self._current_hit.description = data
676 self._last_found = 'description'
677 elif self._last_found == 'description':
678 m = re.findall(r'(\d+)-(\d+) (\w+)', data)
679 for start, end, seq in m:
680 self._current_hit.matches.append(
681 (int(start), int(end), seq))
682
683 def do_hr(self, attrs):
684
685 if self._in_pre:
686 self._current_hit = PatternHit()
687 self.hits.append(self._current_hit)
688 self._last_found = None
689 def start_pre(self, attrs):
690 self._in_pre = 1
691 self.broken_message = None
692 def end_pre(self):
693 self._in_pre = 0
694 p = parser()
695 p.feed(handle.read())
696 if p.broken_message:
697 raise ValueError(p.broken_message)
698 return p.hits
699
700
701
702
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a Prosite file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the id name will be used.

    For every record the index stores a (start, length) pair taken
    from the file position before and after the record was parsed.

    Raises ValueError if filename does not exist, and KeyError if a
    record yields an empty key or a key that is already in the index.

    """
    import os
    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    # Name-mangled spelling of Dictionary's private __filename_key, so
    # the file name is stored where Dictionary.__init__ looks it up.
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    try:
        # Plain ints are used for the offsets: the old ``0L`` literal
        # and ``long()`` are unavailable on Python 3 and unnecessary on
        # Python 2, where ints are promoted automatically.
        end = 0
        for record in parse(handle):
            start = end
            end = handle.tell()
            length = end - start

            if rec2key is not None:
                key = rec2key(record)
            else:
                key = record.name

            if not key:
                raise KeyError("empty key was produced")
            elif key in index:
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, length
    finally:
        # Always release the data file, even if a record is rejected.
        handle.close()
739