1 import sgmllib
2 import Bio.File
3
4 """
5 The LocusLink site is:
6 http://www.ncbi.nlm.nih.gov/LocusLink/
Parses a LocusLink web page.
8 """
9
10 import warnings
11 warnings.warn("Bio.LocusLink was deprecated, as NCBI's LocusLink was superceded by Entrez Gene. If you still need this module, please get in touch with the Biopython developers (biopython-dev@biopython.org) to avoid permanent removal of this module", DeprecationWarning)
12
13
15 response = 0
16 if is_container( item ):
17 if len( item ) == 0:
18 response = 1
19 return response
20
22 response = 0
23 if type( item ) in [ type( [] ), type( {} ) ]:
24 response = 1
25 return response
26
28 if( a.find( b ) < 0 ):
29 return 0
30 else:
31 return 1
32
34 print '%s!!!!!\n' % 'PARAMS'
35 for item in params:
36
37 print 'param ' + str( item )
38 print '-----------------'
39
54
55 -def put( dict, key, val ):
61
62
95
99
101 if not isinstance( other, self.__class__ ):
102 return 0
103 if self.token == other.token:
104 return 1
105 return 0
106
108 if not isinstance( other, Token ):
109 return 1
110 if self.token != other.token:
111 return 1
112 return 0
113
115 output = 'token_%s\n' % self.token
116 return output
117
118
# Marker tokens appended to the parser's structure_stack to delimit the
# start and end of list and dict groupings while a page is scanned.
open_list = Token( 'open_list' )
close_list = Token( 'close_list' )
open_dict = Token( 'open_dict' )
close_dict = Token( 'close_dict' )
123
131
139
142
144
145 - def __init__( self, url, label = '', description = '' ):
149
151 output = '%s\n' % self.label
152 output = output + 'url = %s\n' % self.url
153 output = output + '%s\n' % self.description
154 return output
155
156
158
161
163 queue_keys = self.keys()
164 queue_keys.sort()
165 out = ''
166 for key in queue_keys:
167 out = out + '%s:\n' % key.upper()
168 out = out + self.print_item( self[ key ] )
169 out = out + '\n'
170
171 return out
172
174 indent = ' '
175 out = ''
176 for j in range( 0, level ):
177 indent = indent + ' '
178 if( type( item ) == type( '' ) ):
179 if( item != '' ):
180 out = out + '%s%s\n' % ( indent, item )
181 elif( type( item ) == type([])):
182 for subitem in item:
183 out = out + self.print_item( subitem, level + 1 )
184 out = out + '----------------------------------------------\n'
185 elif( type( item ) == type ( {} ) ):
186 keys = item.keys()
187 keys.sort()
188 for subitem in keys:
189 out = out + '%skey is %s\n' % ( indent, subitem )
190 out = out + self.print_item( item[ subitem ], level + 1 )
191 elif( isinstance( item, dict ) ):
192 keys = item.keys()
193 keys.sort()
194 for subitem in keys:
195 out = out + '%skey is %s\n' % ( indent, subitem )
196 out = out + self.print_item( item[ subitem ], level + 1 )
197 else:
198 out = out + '%s%s\n' % ( indent, str( item ) )
199 return out
200
201
203
205 sgmllib.SGMLParser.reset( self )
206 self.text = ''
207 self.record = Record()
208 self.open_tag_stack = []
209 self.open_tag = 'open_html'
210 self.outer_state = 'undefined'
211 self.section_state = 'undefined'
212 self.local_title = ''
213 self.structure_stack = []
214 self.category = ''
215 self.context_chain = []
216 self.outer_state_dict = { 'nomenclature' : 'nomenclature', 'overview' : 'overview', \
217 'function' : 'function', \
218 'relationships' : 'relationships', \
219 'locus' : 'locus', \
220 'map' : 'map', \
221 'refseq' : 'refseq', \
222 'genbank' : 'genbank', \
223 'external' : 'external_annotation', \
224 'additional' : 'additional_links' \
225 }
226
227
228 - def parse( self, handle ):
232
233
234
235
236 - def feed( self, handle ):
249
def get_text( self ):
    """Return the buffered text and reset the buffer to empty.

    Reads ``self.text``, replaces it with ``''`` and returns the value
    that was stored, so each call only yields the text gathered since
    the previous call.
    """
    text, self.text = self.text, ''
    return text
254
296
297
299 newtext = newtext.strip()
300 self.text = self.text + newtext
301
303 self.open_tag_stack.append( self.open_tag )
304 self.open_tag = 'open_a'
305 attr_dict = {}
306 for key, val in attrs:
307 attr_dict[ key ] = val
308 outer_state = self.outer_state
309 if( outer_state in [ 'nomenclature', 'overview', 'relationships', 'locus', 'map', 'genbank', 'refseq', 'additional', 'external' ] ):
310 if self.section_state == 'local_contents':
311 if self.detail_state in [ 'scan_val', 'unpaired_key' ]:
312 if attr_dict.has_key( 'href' ):
313 href = attr_dict[ 'href' ]
314 self.text = ''
315 self.structure_stack.append( Url( href, '' ) )
316 elif outer_state == 'function':
317 if self.section_state == 'local_contents':
318 if self.detail_state in [ 'scan_val', 'unpaired_key', 'may_be_val' ]:
319 if attr_dict.has_key( 'href' ):
320 href = attr_dict[ 'href' ]
321 self.text = ''
322 self.structure_stack.append( Url( href, '' ) )
323
324
326 try:
327 self.open_tag = self.open_tag_stack.pop()
328 except:
329 self.open_tag = 'open_html'
330 outer_state = self.outer_state
331 if( outer_state in [ 'nomenclature', 'overview', 'relationships', 'locus', 'map', 'refseq', 'genbank', 'additional', 'external' ] ):
332 if self.section_state == 'local_contents':
333 if self.detail_state in [ 'scan_val', 'unpaired_key' ]:
334 text = self.get_text()
335 url = self.structure_stack.pop()
336 if isinstance( url, Url ):
337 url.label = text
338 self.structure_stack.append( url )
339
340 elif outer_state == 'function':
341 if self.section_state == 'local_contents':
342 if self.detail_state in [ 'scan_val', 'unpaired_key',
343 'may_be_val' ]:
344 text = self.get_text()
345 url = self.structure_stack.pop()
346 if isinstance( url, Url ):
347 url.label = text
348 self.structure_stack.append( url )
349
351
352 self.open_tag_stack.append( self.open_tag )
353 self.open_tag = 'open_b'
354 outer_state = self.outer_state
355 if( outer_state in [ 'nomenclature', 'overview', 'function', 'relationships', 'locus', 'map', 'refseq', 'genbank', 'additional', 'external' ] ):
356 self.text = ''
357
358
359
361 try:
362 self.open_tag = self.open_tag_stack.pop()
363 except:
364 self.open_tag = 'open_html'
365 outer_state = self.outer_state
366 if( outer_state in [ 'nomenclature', 'overview', 'function', 'relationships', 'locus', 'map', 'refseq', 'genbank', 'additional', 'external' ] ):
367 if self.section_state == 'local_contents':
368 text = self.get_text()
369 cols = text.split( ':', 1 )
370 key = cols[ 0 ]
371 if( outer_state == 'refseq' ):
372 self.structure_stack.append( cols[ 1 ] )
373 self.structure_stack.append( open_dict )
374 self.detail_state = 'waiting_key'
375 elif outer_state == 'relationships':
376 self.structure_stack.append( key )
377 self.structure_stack.append( open_list )
378 self.detail_state = 'skip'
379 elif outer_state == 'additional':
380 self.structure_stack.append( open_dict )
381 self.structure_stack.append( key )
382 self.structure_stack.append( open_list )
383 self.detail_state = 'unpaired_key'
384 elif outer_state == 'function':
385 if self.detail_state != 'waiting_key':
386 self.structure_stack.append( close_list )
387 self.structure_stack.append( key )
388 self.detail_state = 'unpaired_key'
389 self.structure_stack.append( open_list )
390 self.structure_stack.append( open_list )
391 try:
392 val = cols[ 1 ]
393 if val.strip() != '':
394 self.structure_stack.append( val )
395 self.detail_state = 'unpaired_key'
396
397 except IndexError:
398 pass
399 else:
400 if self.detail_state != 'waiting_key':
401 self.structure_stack.append( close_list )
402 self.detail_state = 'scan_val'
403 self.structure_stack.append( key )
404 self.structure_stack.append( open_list )
405 self.structure_stack.append( open_list )
406 try:
407 val = cols[ 1 ]
408 if val.strip() != '':
409 self.structure_stack.append( val )
410 except IndexError:
411 pass
412
413
415
416 self.open_tag_stack.append( self.open_tag )
417 self.open_tag = 'open_th'
418 outer_state = self.outer_state
419 self.text = ''
420 if outer_state in [ 'function', 'relationships', 'map', 'locus', 'genbank', 'additional', 'external' ]:
421 if self.section_state == 'local_contents':
422 self.detail_state = 'scan_headings'
423
424
425
427 try:
428 self.open_tag = self.open_tag_stack.pop()
429 except:
430 self.open_tag = 'open_html'
431 outer_state = self.outer_state
432 if outer_state == 'refseq':
433 if self.section_state == 'local_contents':
434 text = self.get_text()
435 cols = text.strip().split( ':', 1 )
436 if text.strip().lower().startswith( 'category' ):
437 self.structure_stack.append( open_dict )
438 self.structure_stack.append( cols[ 1 ] )
439 self.structure_stack.append( open_list )
440 self.structure_stack.append( open_dict )
441 self.detail_state = 'found_category'
442
443 elif self.detail_state in [ 'found_category', 'may_be_val' ]:
444 if text.strip() != '':
445 if self.detail_state != 'found_category':
446 self.structure_stack.append( close_list )
447 cols = text.split( ':' )
448 self.structure_stack.append( cols[ 0 ] )
449 self.structure_stack.append( open_list )
450 try:
451 val = cols[ 1 ]
452 self.structure_stack.append( open_list )
453 self.structure_stack.append( val )
454 self.detail_state = 'scan_val'
455 except IndexError:
456 self.detail_state = 'may_be_val'
457
458
459
460
461
463 self.open_tag_stack.append( self.open_tag )
464 self.open_tag = 'open_table'
465 self.text = ''
466 if self.outer_state == 'genbank':
467 if self.section_state == 'local_contents':
468 self.detail_state = 'skip'
469 elif( self.outer_state in [ 'nomenclature', 'overview', 'relationships', 'locus', 'map', 'genbank', 'additional', 'external' ] ):
470
471 if self.section_state == 'local_contents':
472 self.detail_state = 'waiting_key'
473
475 try:
476 self.open_tag = self.open_tag_stack.pop()
477 except:
478 self.open_tag = 'open_html'
479 if( self.section_state == 'local_title' ):
480 if self.outer_state == 'refseq':
481 self.section_state = 'local_contents'
482 elif self.outer_state == 'additional':
483 self.section_state = 'local_contents'
484 self.detail_state = 'scan_val'
485 else:
486 self.section_state = 'local_contents'
487 self.detail_state = 'waiting_key'
488 elif self.section_state == 'local_contents':
489 if( self.outer_state in [ 'nomenclature', 'relationships', 'locus', 'map', 'external' ] ):
490 self.structure_stack.append( close_list )
491 elif ( self.outer_state in [ 'genbank', 'additional' ] ):
492 if self.detail_state == 'scan_val':
493 self.structure_stack.append( close_list )
494
495 elif self.outer_state == 'refseq':
496 if self.detail_state in ['may_be_val', 'scan_val' ]:
497 self.structure_stack.append( close_list )
498 self.structure_stack.append( close_dict )
499 self.structure_stack.append( close_list )
500 self.structure_stack.append( close_dict )
501 self.detail_state = 'scan_category'
502
503
505 top = self.open_tag
506 self.open_tag_stack.append( self.open_tag )
507 if top == 'open_table_row':
508 if self.outer_state == 'refseq':
509 if self.section_state == 'local_contents':
510 if self.detail_state in [ 'scan_val', ]:
511 self.structure_stack.append( close_list )
512 self.detail_state = 'may_be_val'
513 self.open_tag_stack.pop()
514 self.open_tag = 'open_table_row'
515 self.text = ''
516 outer_state = self.outer_state
517 if( outer_state in [ 'relationships', 'locus', 'function', 'genbank', 'external'
518 ] ):
519 if self.section_state == 'local_contents':
520 if self.detail_state == 'scan_val':
521 self.structure_stack.append( open_list )
522 elif outer_state == 'map':
523 if self.section_state == 'local_contents':
524 if self.detail_state == 'scan_val':
525 self.structure_stack.append( open_list )
526
527 elif outer_state == 'additional':
528 if self.section_state == 'local_contents':
529 self.detail_state = 'scan_val'
530 self.structure_stack.append( open_list )
531
532
534 try:
535 self.open_tag = self.open_tag_stack.pop()
536 except:
537 self.open_tag = 'open_html'
538 if self.section_state == 'local_contents':
539 if( self.outer_state in [ 'overview', 'nomenclature', 'relationships',
540 'locus', 'genbank', 'external' ] ):
541 if self.detail_state == 'scan_val':
542 self.structure_stack.append( close_list )
543 elif self.detail_state == 'unpaired_key':
544 self.structure_stack.append( close_list )
545 elif self.detail_state == 'skip':
546 self.detail_state = 'scan_val'
547 elif self.detail_state == 'scan_headings':
548 self.detail_state = 'scan_val'
549 elif self.outer_state in [ 'additional', ]:
550 if self.detail_state == 'unpaired_key':
551 self.structure_stack.append( close_list )
552 self.structure_stack.append( close_dict )
553 self.structure_stack.append( close_list )
554 elif self.detail_state == 'scan_val':
555 self.structure_stack.append( close_list )
556 elif self.outer_state in [ 'function', ]:
557 if self.detail_state == 'scan_headings':
558 self.detail_state = 'scan_val'
559 elif self.detail_state == 'unpaired_key':
560 self.detail_state = 'may_be_val'
561 self.structure_stack.append( close_list )
562 elif self.detail_state == 'scan_val':
563 self.detail_state = 'may_be_val'
564 self.structure_stack.append( close_list )
565 elif self.outer_state in [ 'refseq', ]:
566 if self.section_state == 'local_contents':
567 if self.detail_state == 'scan_val':
568 self.structure_stack.append( close_list )
569 self.detail_state = 'may_be_val'
570 elif self.outer_state == 'map':
571 if self.section_state == 'local_contents':
572 if self.detail_state == 'scan_val':
573 self.structure_stack.append( close_list )
574 self.detail_state = 'may_be_val'
575
576
578 self.open_tag_stack.append( self.open_tag )
579 self.open_tag = 'open_table_data'
580 if self.outer_state in [ 'nomenclature', 'overview', 'relationships', 'map', 'locus', 'genbank', 'additional', 'external' ]:
581 if( self.section_state == 'local_contents' ):
582 self.text = ''
583 elif self.outer_state == 'refseq':
584 if self.section_state == 'local_contents':
585 self.text = ''
586 if self.detail_state == 'may_be_val':
587 self.structure_stack.append( open_list )
588 self.detail_state = 'scan_val'
589
591 try:
592 self.open_tag = self.open_tag_stack.pop()
593 except:
594 self.open_tag = 'open_html'
595
596
597
598 if self.outer_state in [ 'nomenclature', 'overview', 'relationships', 'locus', 'genbank', 'additional', 'external' ]:
599 if( self.section_state == 'local_contents' ):
600 if self.detail_state == 'scan_val':
601 text = self.get_text()
602 if( text != '' ):
603 self.structure_stack.append( text )
604 elif self.outer_state == 'function':
605 if self.section_state == 'local_contents':
606 text = self.get_text()
607 if( text != '' ):
608 if self.detail_state == 'may_be_val':
609 if text.strip() != '':
610 self.structure_stack.append( open_list )
611 self.detail_state = 'scan_val'
612 if self.detail_state in [ 'unpaired_key', 'scan_val' ]:
613 self.structure_stack.append( text )
614 elif self.outer_state == 'map':
615 if self.section_state == 'local_contents':
616 text = self.get_text()
617 if( text != '' ):
618 if self.detail_state == 'may_be_val':
619 if text.strip() != '':
620 self.structure_stack.append( open_list )
621 self.detail_state = 'scan_val'
622 if self.detail_state == 'scan_val':
623 self.structure_stack.append( text )
624 elif self.outer_state == 'refseq':
625 if self.section_state == 'local_contents':
626 if self.detail_state == 'scan_val':
627 text = self.get_text()
628 if( text != '' ):
629 self.add_text_to_object( text )
630
def do_br( self, attrs ):
    """Push the pending text onto the structure stack at a line break.

    Presumably invoked by sgmllib.SGMLParser for ``<br>`` tags (the
    ``do_<tag>`` naming convention) -- confirm against the enclosing class.

    The buffered text is moved to ``self.structure_stack`` only when all
    of the following hold, checked in this order:

    * ``self.outer_state`` is one of the sections listed below (every
      section handled in this file except 'refseq'),
    * ``self.section_state`` is 'local_contents',
    * ``self.detail_state`` is 'scan_val',
    * an 'open_table_data' tag is reported by ``self.is_contained_by``.

    Empty text is discarded rather than pushed.

    attrs -- tag attributes; not used here.
    """
    sections = ( 'nomenclature', 'overview', 'function', 'relationships',
                 'map', 'locus', 'genbank', 'additional', 'external' )
    # The guards keep the original evaluation order, so detail_state is
    # only read once section_state is already 'local_contents'.
    if self.outer_state not in sections:
        return
    if self.section_state != 'local_contents':
        return
    if self.detail_state != 'scan_val':
        return
    if not self.is_contained_by( 'open_table_data' ):
        return
    text = self.get_text()
    if text != '':
        self.structure_stack.append( text )
639
640
def add_text_to_object( self, text ):
    """Attach text to the Url on top of the structure stack, or push it.

    * If the top of ``self.structure_stack`` is a Url whose description
      is still '', the text becomes that description.
    * If the top is a Url that already has a description, the text is
      dropped and the stack is left unchanged.
    * If the top is anything else, the text is pushed as a new item.
    * If the stack is empty, the text is pushed as the only item (the
      previous code raised IndexError from ``pop()`` here).

    NOTE(review): the original indentation was lost, so the re-push of
    the Url was read as unconditional (outside the description check);
    confirm against the upstream file.

    text -- string to attach or push.
    """
    stack = self.structure_stack
    if not stack:
        stack.append( text )
        return
    # Look at the top item in place; the original popped it and always
    # pushed it straight back, which leaves the stack in the same state.
    top = stack[ -1 ]
    if isinstance( top, Url ):
        if top.description == '':
            top.description = text
    else:
        stack.append( text )
650
651
652
654 return tag in self.open_tag_stack
655
657 params = []
658 outer_state = self.outer_state
659 if outer_state in [ 'nomenclature', 'overview', 'function', 'relationships', 'refseq', 'locus', 'map', 'genbank', 'additional', 'external' ]:
660 while len( self.structure_stack ) > 1:
661 len_stack = len( self.structure_stack )
662
663 for i in range ( 0, len_stack ):
664 item = self.structure_stack.pop()
665 if not is_open_token( item ):
666 params.append( item )
667 else: break
668 if( open_list.__eq__( item ) ):
669 container = process_list( params )
670 params.append( container )
671 else:
672 container = process_dict( params )
673 if len( container ) > 0:
674 params.append( container )
675 if ( len( self.structure_stack ) == 0 ) or is_open_token(
676 self.structure_stack[ -1 ] ):
677 for j in range( 0, len( params ) ):
678 item = params.pop()
679 self.structure_stack.append( item )
680 params = []
681
682
684 print '%s!!!!!\n' % self.outer_state.upper()
685 for stack_item in self.structure_stack:
686 print 'stack has ' + str( stack_item )
687 print '-----------------'
688
689
690
691
# Script entry point: parse the saved page 'Hs13225.htm' from the current
# directory and print the resulting record.
if( __name__ == '__main__' ):
    handle = open( 'Hs13225.htm')
    try:
        # NOTE(review): undo_handle is created but the raw handle is what
        # gets passed to parse(); kept as in the original -- confirm which
        # one parse() is meant to receive.
        undo_handle = Bio.File.UndoHandle( handle )
        locuslink_parser = LocusLinkParser()
        record = locuslink_parser.parse( handle )
        # Parenthesised form prints the same text under Python 2 and is
        # also valid Python 3 syntax.
        print( record )
    finally:
        # Close the file even if parsing fails.
        handle.close()
698