1
2
3
4
5
6 """
7 This module provided code to parse HTML files from NDB (DEPRECATED).
8
9 This module provides an HTML parser designed for the NDB website
10 http://ndbserver.rutgers.edu/ as it was circa 2002. The site has since
11 been redesigned, breaking the parser. Bio.Ndb is therefore deprecated,
12 and will be removed in a future release of Biopython.
13
14 Classes:
15 Record Holds NDB sequence data.
16 NdbParser Parses NDB sequence data into a Record object.
17
18 The algorithm is based on a state machine because the record has multiple
19 sections and the handling of tags varies depending on the section.
20 Citations have their own state machine.
21 """
22 import warnings
23 warnings.warn("Bio.Ndb has been deprecated as the NDB website it used to"\
24 " parse has been redesigned.", DeprecationWarning)
25
26 from types import *
27 from Bio import File
28 from Bio import Index
29 from Bio.Crystal import Hetero
30 from Bio.Crystal import Chain
31 from Bio.Crystal import Crystal
32 from Bio.SeqFeature import Reference
33 import urllib
34 import sgmllib
35 from Bio.ParserSupport import *
36 from Bio.SeqFeature import Reference
37
38
40
42 self[ 'Id' ] = ''
43 self[ 'Features' ] = ''
44 self[ 'Name' ] = ''
45 self[ 'Sequence' ] = Crystal( {} )
46 self[ 'Citation' ] = Reference()
47 self[ 'Space Group' ] = ''
48 self[ 'Cell Constants' ] = {}
49 self[ 'Crystallization Conditions' ] = []
50 self[ 'Refinement' ] = ''
51 self[ 'Coordinates' ] = ''
52
54 keys = self.keys()
55 keys.sort()
56 out = ''
57 for key in keys:
58 val = self[ key ]
59 if( type( val ) == type( [] ) ):
60 out = out + '\n%s\n' % key
61 for item in val:
62 out = out + '%s\n' % item
63
64 elif( type( val ) == type( {} ) ):
65 out = out + '\n%s\n' % key
66 subkeys = val.keys()
67 subkeys.sort()
68 for item in subkeys:
69 out = out + '%s : %s\n' % ( item, val[ item ] )
70 elif( isinstance( val, dict ) ):
71 out = out + '\n%s\n' % key
72 subkeys = val.keys()
73 subkeys.sort()
74 for item in subkeys:
75 out = out + '%s : %s\n' % ( item, val[ item ] )
76
77 else:
78 out = out + '%s: %s\n' % ( key, self[ key ] )
79 return out
80
99
100
101
102
103
105 """Parses Ndb sequence data into a Record object.
106 data available at: http://ndbserver.rutgers.edu/NDB/NDBATLAS/index.html
107 """
109 sgmllib.SGMLParser.reset( self )
110 self.ndb_dict = Record()
111 self.text = ''
112 self._space_group = ''
113 self._state = 'id'
114 self._reference_state = 'authors'
115 self._current_reference = Reference()
116
117 - def parse(self, handle):
121
def feed(self, handle):
    """feed(self, handle)

    Feed in ndb data for scanning.

    handle is a file-like object containing ndb data.  It is wrapped in a
    File.UndoHandle unless it already is one.

    Lines are read one at a time and stripped of surrounding whitespace.
    Reading stops at end of input, or at the first line that ends with
    '</HTML>' (that line itself is discarded).  Each kept line is
    prefixed with a single space, the pieces are joined, and the result
    is handed to sgmllib.SGMLParser.feed in one call.
    """
    if isinstance(handle, File.UndoHandle):
        uhandle = handle
    else:
        uhandle = File.UndoHandle(handle)

    # Collect the pieces and join once instead of growing a string with
    # "+" on every line, which is quadratic for long pages.
    pieces = []
    while True:
        line = uhandle.readline()
        if not line:
            break
        line = line.strip()
        # The match is deliberately case-sensitive, as in the original.
        if line.endswith('</HTML>'):
            break
        pieces.append(' ' + line)

    sgmllib.SGMLParser.feed(self, ''.join(pieces))
145
146
148 newtext = newtext.strip()
149 self.text = self.text + newtext
150
153
155 text = self._flush_text()
156 if( self._state == 'id' ):
157 cols = text.split( ':' )
158 self.ndb_dict[ 'Id' ] = ( cols[ 1 ] ).upper()
159 self._state = 'id_found'
160
162 text = self._flush_text()
163 if( self._state == 'features' ):
164 self.ndb_dict[ 'Features' ] = text
165 elif( self._state == 'name' ):
166 self.ndb_dict[ 'Name' ] = text
167 elif( self._state == 'sequence' ):
168 pass
169 elif( self._state == 'citation' ):
170 if( self._reference_state == 'journal' ):
171 self._current_reference.journal = text
172 self.ndb_dict[ 'Citation' ] = self._current_reference
173 elif( self._state == 'space' ):
174 self._space_group = self._space_group + text
175 self.ndb_dict[ 'Space Group' ] = self._space_group
176 elif( self._state == 'constants' ):
177 self.ndb_dict[ 'Cell Constants' ] = _parse_constants( text )
178 elif( self._state == 'crystallization' ):
179 pass
180 elif( self._state == 'refinement' ):
181 self.ndb_dict[ 'Refinement' ] = text
182 elif( self._state == 'coordinates' ):
183 self.ndb_dict[ 'Coordinates' ] = text
184
186 text = self._flush_text()
187 text = text.lower()
188 if( self._state == 'id' ):
189 if( text.find( 'id' ) >= 0 ):
190 cols = text.split( ':' )
191 self.ndb_dict[ 'Id' ] = ( cols[ 1 ] ).upper()
192 self._state = 'id_found'
193 elif( text.find( 'feature' ) >= 0 ):
194 self._state = 'features'
195 elif( text.find( 'name' ) >= 0 ):
196 self._state = 'name'
197 elif( text.find( 'sequence' ) >= 0 ):
198 self._state = 'sequence'
199 elif( text.find( 'citation' ) >= 0 ):
200 self._state = 'citation'
201 elif( text.find( 'space' ) >= 0 ):
202 self._state = 'space'
203 elif( text.find( 'constants' ) >= 0 ):
204 self._state = 'constants'
205 elif( text.find( 'crystallization' ) >= 0 ):
206 self._state = 'crystallization'
207 elif( text.find( 'refinement' ) >= 0 ):
208 self._state = 'refinement'
209 elif( text.find( 'coordinates' ) >= 0 ):
210 self._state = 'coordinates'
211
212
214 if( self._state == 'sequence' ):
215 self._flush_text()
216
217 elif( self._state == 'crystallization' ):
218 self._flush_text()
219
221 if( self._state == 'sequence' ):
222 self._parse_chain()
223 elif( self._state == 'crystallization' ):
224 text = self._flush_text()
225 ( self.ndb_dict[ 'Crystallization Conditions' ] ).append( text )
226 elif( self._state == 'citation' ):
227 if( self._reference_state == 'journal' ):
228 self._current_reference.journal = self._flush_text()
229 self._reference_state = 'done'
230
232 if( self._state == 'space' ):
233 self._space_group = self._space_group + self._flush_text()
234
236 if( self._state == 'space' ):
237 self._space_group = self._space_group + '(%s) ' % self._flush_text()
238
240 if( self._state == 'sequence' ):
241 self._parse_chain()
242 elif( self._state == 'crystallization' ):
243 text = self._flush_text()
244 ( self.ndb_dict[ 'Crystallization Conditions' ] ).append( text )
245
247 if( self._state == 'sequence' ):
248 self._parse_chain()
249 elif( self._state == 'crystallization' ):
250 text = self._flush_text()
251 ( self.ndb_dict[ 'Crystallization Conditions' ] ).append( text )
252
def do_br(self, attrs):
    """Advance the citation sub-state when a line break is seen.

    Only acts while self._state is 'citation'.  The text buffered so far
    is flushed into the current reference: as its authors when the
    reference state is 'authors' (which then becomes 'title'), or as its
    title when the reference state is 'title' (which then becomes
    'journal').  Any other reference state is left untouched and nothing
    is flushed.  attrs is not used.

    NOTE(review): presumably called by sgmllib for <br> tags -- the class
    header is not visible here, confirm against the base class.
    """
    if self._state != 'citation':
        return

    # reference field to fill -> reference state that follows it
    next_stage = {'authors': 'title', 'title': 'journal'}
    stage = self._reference_state
    if stage in next_stage:
        setattr(self._current_reference, stage, self._flush_text())
        self._reference_state = next_stage[stage]
261
264
266 if( self._state == 'references' ):
267 if( self._reference_state == 'title' ):
268 text = self._flush_text()
269 self._current_reference.title = text
270 self._reference_state = 'journal'
271
272
282
283
284
def _flush_text(self):
    """Return the buffered text, stripped, and reset the buffer.

    Reads self.text, removes leading and trailing whitespace, sets
    self.text back to the empty string, and returns the stripped value.
    """
    flushed = self.text.strip()
    self.text = ''
    return flushed
289
290
if __name__ == '__main__':
    # Ad-hoc run: parse a locally saved NDB page and print the record.
    handle = open('PR0004.htm')
    try:
        # No File.UndoHandle wrapper is built here: the original created
        # one but never used it, and feed() wraps plain handles itself.
        ndb_parser = NdbParser()
        record = ndb_parser.parse(handle)
    finally:
        # Close the file even if parsing raises.
        handle.close()
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print(str(record))
297