Package Bio :: Package LocusLink :: Module web_parse
[hide private]
[frames | no frames]

Source Code for Module Bio.LocusLink.web_parse

  1  import sgmllib 
  2  import Bio.File 
  3   
  4  """ 
  5  The LocusLink site is: 
  6  http://www.ncbi.nlm.nih.gov/LocusLink/ 
  7  Parses a Locus web page. 
  8  """ 
  9   
# Importing this module always emits a DeprecationWarning; the message below
# explains why and whom to contact.
import warnings
warnings.warn("Bio.LocusLink was deprecated, as NCBI's LocusLink was superceded by Entrez Gene. If you still need this module, please get in touch with the Biopython developers (biopython-dev@biopython.org) to avoid permanent removal of this module", DeprecationWarning)
 12   
 13   
def is_empty_container(item):
    """Tell whether ``item`` is a plain list or dict with no elements.

    Returns 1 for an empty ``list`` or ``dict`` (exact types only; subclass
    instances do not count) and 0 for anything else.
    """
    # Same exact-type test that is_container() applies.
    if type(item) not in [type([]), type({})]:
        return 0
    if len(item) != 0:
        return 0
    return 1
20
def is_container(item):
    """Return 1 if ``item`` is exactly a ``list`` or a ``dict``, else 0.

    The check compares the concrete type, so instances of list/dict
    subclasses are reported as non-containers.
    """
    if type(item) in (list, dict):
        return 1
    return 0
26
def is_substring(a, b):
    """Return 1 when ``b`` occurs somewhere inside ``a`` and 0 otherwise.

    The search is delegated to ``a.find(b)``, so ``a`` must provide a
    string-like ``find`` method.
    """
    position = a.find(b)
    return 0 if position < 0 else 1
32 39
def process_list(params):
    """Move items from the end of ``params`` into a new list.

    Items are popped one at a time from the end of ``params`` (which is
    modified in place).  Collection stops when ``params`` runs out or when
    an open or close token is popped; that token is consumed and is not
    placed in the result.

    Returns the collected items in the order they were popped.
    """
    container = []
    # Loop on emptiness instead of catching every exception raised by pop().
    while params:
        element = params.pop()
        if is_close_token(element) or is_open_token(element):
            break
        container.append(element)
    return container
54
def put(dict, key, val):
    """Store ``val`` under ``key`` without discarding an existing value.

    If ``key`` is new, ``val`` is stored directly.  If ``key`` is already
    present, the entry becomes the two-element list ``[old_value, val]``,
    so a third value for the same key yields ``[[first, second], third]``.

    The first parameter keeps its historical name ``dict`` (shadowing the
    builtin inside this function) so existing keyword callers still work.
    """
    # ``in`` replaces has_key(), which does not exist on Python 3 dicts.
    if key in dict:
        dict[key] = [dict[key], val]
    else:
        dict[key] = val
61 62
def process_dict(params):
    """Fold key/value items popped from ``params`` into a new dict.

    ``params`` is consumed from its end and modified in place:

    * a plain ``dict`` item is merged into the result entry by entry;
    * a close token ends processing (the token is consumed);
    * an open token is pushed back for the caller and ends processing;
    * any other item is taken as a key and the next popped item as its
      value.  A one-element list value is unwrapped to its single element.
      A close token in the value position ends processing.

    Values are stored with ``put``, so repeated keys accumulate.

    Raises IndexError if a key is the last item and has no value after it.
    """
    container = {}
    while params:
        element = params.pop()
        if type(element) == type({}):
            for key, val in element.items():
                put(container, key, val)
        elif is_close_token(element):
            break
        elif is_open_token(element):
            # Hand the token back and stop.  Pushing it back without
            # stopping would pop the same token again forever.
            params.append(element)
            break
        else:
            val = params.pop()
            if type(val) == type([]):
                if len(val) == 1:
                    val = val[0]
            elif is_close_token(val):
                break
            try:
                put(container, element, val)
            except Exception:
                # Best-effort diagnostics, kept from the original: report
                # the offending key and push it back onto ``params``.
                print('Element')
                print(element)
                params.append(element)
    return container
95
class Token:
    """Named marker object, compared by its ``token`` name.

    Two tokens are equal when they carry the same ``token`` value.
    """

    def __init__(self, token):
        self.token = token

    def __eq__(self, other):
        """Return True only for an instance of this class with the same name."""
        if not isinstance(other, self.__class__):
            return False
        return bool(self.token == other.token)

    def __ne__(self, other):
        """Return the exact negation of ``__eq__``.

        Deriving one from the other keeps ``==`` and ``!=`` consistent,
        including for subclass instances.
        """
        return not self.__eq__(other)

    def __hash__(self):
        # Defined together with __eq__ so equal tokens hash alike and
        # tokens stay usable in sets and as dict keys.
        return hash(self.token)

    def __str__(self):
        return 'token_%s\n' % self.token
# Shared marker tokens.  The parser pushes these onto its structure stack to
# mark where a list or a dict begins and ends; is_open_token() and
# is_close_token() compare candidates against them.
open_list = Token( 'open_list' )
close_list = Token( 'close_list' )
open_dict = Token( 'open_dict' )
close_dict = Token( 'close_dict' )
def is_open_token(target):
    """Return 1 if ``target`` equals ``open_list`` or ``open_dict``, else 0.

    Anything that is not a ``Token`` instance is rejected without comparing.
    """
    if not isinstance(target, Token):
        return 0
    for opener in (open_list, open_dict):
        if opener.__eq__(target):
            return 1
    return 0
131
def is_close_token(target):
    """Return 1 if ``target`` equals ``close_list`` or ``close_dict``, else 0.

    Anything that is not a ``Token`` instance is rejected without comparing.
    """
    if not isinstance(target, Token):
        return 0
    for closer in (close_list, close_dict):
        if closer.__eq__(target):
            return 1
    return 0
139
def is_token(target):
    """Return a true value if ``target`` is an open or a close token."""
    opening = is_open_token(target)
    if opening:
        return opening
    return is_close_token(target)
142
class Url:
    """A hyperlink: target ``url`` plus optional ``label`` and ``description``."""

    def __init__(self, url, label='', description=''):
        self.description = description
        self.label = label
        self.url = url

    def __str__(self):
        """Render as three lines: label, ``url = <url>``, description."""
        lines = [
            '%s\n' % self.label,
            'url = %s\n' % self.url,
            '%s\n' % self.description,
        ]
        return ''.join(lines)
155 156
class Record(dict):
    """Dictionary of parsed sections with an indented text rendering."""

    def __init__(self):
        dict.__init__(self)

    def __str__(self):
        """Render every entry, sorted by key, under an upper-cased heading."""
        pieces = []
        # sorted() works on Python 2 lists and Python 3 key views alike;
        # keys().sort() only works on Python 2.
        for key in sorted(self.keys()):
            pieces.append('%s:\n' % key.upper())
            pieces.append(self.print_item(self[key]))
            pieces.append('\n')
        return ''.join(pieces)

    def print_item(self, item, level=1):
        """Return ``item`` rendered as indented text.

        * a non-empty ``str`` becomes one indented line (empty strings
          render as nothing);
        * a ``list`` renders each element one level deeper, followed by a
          dashed separator line;
        * a ``dict`` (or subclass) renders ``key is <key>`` lines in sorted
          key order, each followed by its value one level deeper;
        * anything else is rendered through ``str()``.

        ``level`` controls the indentation depth.
        """
        indent = ' ' + ' ' * level
        if type(item) == type(''):
            if item == '':
                return ''
            return '%s%s\n' % (indent, item)
        if type(item) == type([]):
            pieces = [self.print_item(subitem, level + 1) for subitem in item]
            pieces.append('----------------------------------------------\n')
            return ''.join(pieces)
        if isinstance(item, dict):
            # Plain dicts and dict subclasses are rendered identically.
            pieces = []
            for subitem in sorted(item.keys()):
                pieces.append('%skey is %s\n' % (indent, subitem))
                pieces.append(self.print_item(item[subitem], level + 1))
            return ''.join(pieces)
        return '%s%s\n' % (indent, str(item))
200 201
202 -class LocusLinkParser( sgmllib.SGMLParser ):
203
204 - def reset( self ):
205 sgmllib.SGMLParser.reset( self ) 206 self.text = '' 207 self.record = Record() 208 self.open_tag_stack = [] 209 self.open_tag = 'open_html' 210 self.outer_state = 'undefined' 211 self.section_state = 'undefined' 212 self.local_title = '' 213 self.structure_stack = [] 214 self.category = '' 215 self.context_chain = [] 216 self.outer_state_dict = { 'nomenclature' : 'nomenclature', 'overview' : 'overview', \ 217 'function' : 'function', \ 218 'relationships' : 'relationships', \ 219 'locus' : 'locus', \ 220 'map' : 'map', \ 221 'refseq' : 'refseq', \ 222 'genbank' : 'genbank', \ 223 'external' : 'external_annotation', \ 224 'additional' : 'additional_links' \ 225 }
226 227
228 - def parse( self, handle ):
229 self.reset() 230 self.feed( handle ) 231 return self.record
232 233 # 234 # Assumes an empty line between records 235 #
236 - def feed( self, handle ):
237 if isinstance(handle, Bio.File.UndoHandle): 238 uhandle = handle 239 else: 240 uhandle = Bio.File.UndoHandle(handle) 241 text = '' 242 while 1: 243 line = uhandle.readline() 244 if not line: 245 break 246 text = text + ' ' + line 247 248 sgmllib.SGMLParser.feed( self, text )
249
250 - def get_text( self ):
251 text = self.text 252 self.text = '' 253 return text
254
255 - def handle_comment( self, comment ):
256 while comment.startswith( '-' ): 257 comment = comment[ 1: ] 258 comment = comment.strip() 259 comment = comment.lower() 260 261 keys = self.outer_state_dict.keys() 262 for key in keys: 263 if comment.startswith( key ): 264 if key in [ 'nomenclature', 'overview', 'function', 265 'relationships', 'map', 'locus', 'external' ]: 266 self.structure_stack.append( open_dict ) 267 elif key in [ 'genbank', 'additional' ]: 268 self.structure_stack.append( open_list ) 269 elif key in [ 'refseq' ]: 270 self.structure_stack.append( open_list ) 271 self.outer_state = key 272 self.section_state = 'local_title' 273 self.detail_state = 'undefined' 274 if( key == 'refseq' ): 275 self.detail_state = 'waiting_category' 276 else: 277 self.detail_state = 'waiting_key' 278 break 279 if comment.startswith( 'end' ): 280 if is_substring( comment.lower(), self.outer_state ): 281 if self.outer_state == 'refseq': 282 self.structure_stack.append( close_list ) 283 elif self.outer_state == 'function': 284 self.structure_stack.append( close_list ) 285 self.structure_stack.append( close_dict ) 286 self.process_structure_stack() 287 while 1: 288 try: 289 item = self.structure_stack.pop() 290 except: 291 item = 'Not Available' 292 if not is_token( item ) : break 293 key = self.outer_state 294 self.record[ self.outer_state_dict[ key ] ] = item 295 self.outer_state = 'undefined'
296 297
298 - def handle_data(self, newtext ):
299 newtext = newtext.strip() 300 self.text = self.text + newtext
301
302 - def start_a( self, attrs ):
303 self.open_tag_stack.append( self.open_tag ) 304 self.open_tag = 'open_a' 305 attr_dict = {} 306 for key, val in attrs: 307 attr_dict[ key ] = val 308 outer_state = self.outer_state 309 if( outer_state in [ 'nomenclature', 'overview', 'relationships', 'locus', 'map', 'genbank', 'refseq', 'additional', 'external' ] ): 310 if self.section_state == 'local_contents': 311 if self.detail_state in [ 'scan_val', 'unpaired_key' ]: 312 if attr_dict.has_key( 'href' ): 313 href = attr_dict[ 'href' ] 314 self.text = '' 315 self.structure_stack.append( Url( href, '' ) ) 316 elif outer_state == 'function': 317 if self.section_state == 'local_contents': 318 if self.detail_state in [ 'scan_val', 'unpaired_key', 'may_be_val' ]: 319 if attr_dict.has_key( 'href' ): 320 href = attr_dict[ 'href' ] 321 self.text = '' 322 self.structure_stack.append( Url( href, '' ) )
323 324
325 - def end_a( self ):
326 try: 327 self.open_tag = self.open_tag_stack.pop() 328 except: 329 self.open_tag = 'open_html' 330 outer_state = self.outer_state 331 if( outer_state in [ 'nomenclature', 'overview', 'relationships', 'locus', 'map', 'refseq', 'genbank', 'additional', 'external' ] ): 332 if self.section_state == 'local_contents': 333 if self.detail_state in [ 'scan_val', 'unpaired_key' ]: 334 text = self.get_text() 335 url = self.structure_stack.pop() 336 if isinstance( url, Url ): 337 url.label = text 338 self.structure_stack.append( url ) 339 340 elif outer_state == 'function': 341 if self.section_state == 'local_contents': 342 if self.detail_state in [ 'scan_val', 'unpaired_key', 343 'may_be_val' ]: 344 text = self.get_text() 345 url = self.structure_stack.pop() 346 if isinstance( url, Url ): 347 url.label = text 348 self.structure_stack.append( url )
349
350 - def start_b( self, attrs ):
351 352 self.open_tag_stack.append( self.open_tag ) 353 self.open_tag = 'open_b' 354 outer_state = self.outer_state 355 if( outer_state in [ 'nomenclature', 'overview', 'function', 'relationships', 'locus', 'map', 'refseq', 'genbank', 'additional', 'external' ] ): 356 self.text = ''
357 358 359
360 - def end_b( self ):
361 try: 362 self.open_tag = self.open_tag_stack.pop() 363 except: 364 self.open_tag = 'open_html' 365 outer_state = self.outer_state 366 if( outer_state in [ 'nomenclature', 'overview', 'function', 'relationships', 'locus', 'map', 'refseq', 'genbank', 'additional', 'external' ] ): 367 if self.section_state == 'local_contents': 368 text = self.get_text() 369 cols = text.split( ':', 1 ) 370 key = cols[ 0 ] 371 if( outer_state == 'refseq' ): 372 self.structure_stack.append( cols[ 1 ] ) 373 self.structure_stack.append( open_dict ) 374 self.detail_state = 'waiting_key' 375 elif outer_state == 'relationships': 376 self.structure_stack.append( key ) 377 self.structure_stack.append( open_list ) 378 self.detail_state = 'skip' 379 elif outer_state == 'additional': 380 self.structure_stack.append( open_dict ) 381 self.structure_stack.append( key ) 382 self.structure_stack.append( open_list ) 383 self.detail_state = 'unpaired_key' 384 elif outer_state == 'function': 385 if self.detail_state != 'waiting_key': 386 self.structure_stack.append( close_list ) 387 self.structure_stack.append( key ) 388 self.detail_state = 'unpaired_key' 389 self.structure_stack.append( open_list ) 390 self.structure_stack.append( open_list ) 391 try: 392 val = cols[ 1 ] 393 if val.strip() != '': 394 self.structure_stack.append( val ) 395 self.detail_state = 'unpaired_key' 396 397 except IndexError: 398 pass 399 else: 400 if self.detail_state != 'waiting_key': 401 self.structure_stack.append( close_list ) 402 self.detail_state = 'scan_val' 403 self.structure_stack.append( key ) 404 self.structure_stack.append( open_list ) 405 self.structure_stack.append( open_list ) 406 try: 407 val = cols[ 1 ] 408 if val.strip() != '': 409 self.structure_stack.append( val ) 410 except IndexError: 411 pass
412 413
414 - def start_th( self, attrs ):
415 416 self.open_tag_stack.append( self.open_tag ) 417 self.open_tag = 'open_th' 418 outer_state = self.outer_state 419 self.text = '' 420 if outer_state in [ 'function', 'relationships', 'map', 'locus', 'genbank', 'additional', 'external' ]: 421 if self.section_state == 'local_contents': 422 self.detail_state = 'scan_headings'
423 424 425
426 - def end_th( self ):
427 try: 428 self.open_tag = self.open_tag_stack.pop() 429 except: 430 self.open_tag = 'open_html' 431 outer_state = self.outer_state 432 if outer_state == 'refseq': 433 if self.section_state == 'local_contents': 434 text = self.get_text() 435 cols = text.strip().split( ':', 1 ) 436 if text.strip().lower().startswith( 'category' ): 437 self.structure_stack.append( open_dict ) 438 self.structure_stack.append( cols[ 1 ] ) 439 self.structure_stack.append( open_list ) 440 self.structure_stack.append( open_dict ) 441 self.detail_state = 'found_category' 442 443 elif self.detail_state in [ 'found_category', 'may_be_val' ]: 444 if text.strip() != '': 445 if self.detail_state != 'found_category': 446 self.structure_stack.append( close_list ) 447 cols = text.split( ':' ) 448 self.structure_stack.append( cols[ 0 ] ) 449 self.structure_stack.append( open_list ) 450 try: 451 val = cols[ 1 ] 452 self.structure_stack.append( open_list ) 453 self.structure_stack.append( val ) 454 self.detail_state = 'scan_val' 455 except IndexError: 456 self.detail_state = 'may_be_val'
457 458 459 460 461
462 - def start_table( self, attrs ):
463 self.open_tag_stack.append( self.open_tag ) 464 self.open_tag = 'open_table' 465 self.text = '' 466 if self.outer_state == 'genbank': 467 if self.section_state == 'local_contents': 468 self.detail_state = 'skip' 469 elif( self.outer_state in [ 'nomenclature', 'overview', 'relationships', 'locus', 'map', 'genbank', 'additional', 'external' ] ): 470 471 if self.section_state == 'local_contents': 472 self.detail_state = 'waiting_key'
473
474 - def end_table( self ):
475 try: 476 self.open_tag = self.open_tag_stack.pop() 477 except: 478 self.open_tag = 'open_html' 479 if( self.section_state == 'local_title' ): 480 if self.outer_state == 'refseq': 481 self.section_state = 'local_contents' 482 elif self.outer_state == 'additional': 483 self.section_state = 'local_contents' 484 self.detail_state = 'scan_val' 485 else: 486 self.section_state = 'local_contents' 487 self.detail_state = 'waiting_key' 488 elif self.section_state == 'local_contents': 489 if( self.outer_state in [ 'nomenclature', 'relationships', 'locus', 'map', 'external' ] ): 490 self.structure_stack.append( close_list ) 491 elif ( self.outer_state in [ 'genbank', 'additional' ] ): 492 if self.detail_state == 'scan_val': 493 self.structure_stack.append( close_list ) 494 495 elif self.outer_state == 'refseq': 496 if self.detail_state in ['may_be_val', 'scan_val' ]: 497 self.structure_stack.append( close_list ) 498 self.structure_stack.append( close_dict ) 499 self.structure_stack.append( close_list ) 500 self.structure_stack.append( close_dict ) 501 self.detail_state = 'scan_category'
502 503
504 - def start_tr( self, attrs ):
505 top = self.open_tag 506 self.open_tag_stack.append( self.open_tag ) 507 if top == 'open_table_row': 508 if self.outer_state == 'refseq': 509 if self.section_state == 'local_contents': 510 if self.detail_state in [ 'scan_val', ]: 511 self.structure_stack.append( close_list ) 512 self.detail_state = 'may_be_val' 513 self.open_tag_stack.pop() 514 self.open_tag = 'open_table_row' 515 self.text = '' 516 outer_state = self.outer_state 517 if( outer_state in [ 'relationships', 'locus', 'function', 'genbank', 'external' 518 ] ): 519 if self.section_state == 'local_contents': 520 if self.detail_state == 'scan_val': 521 self.structure_stack.append( open_list ) 522 elif outer_state == 'map': 523 if self.section_state == 'local_contents': 524 if self.detail_state == 'scan_val': 525 self.structure_stack.append( open_list ) 526 527 elif outer_state == 'additional': 528 if self.section_state == 'local_contents': 529 self.detail_state = 'scan_val' 530 self.structure_stack.append( open_list )
531 532
533 - def end_tr( self ):
534 try: 535 self.open_tag = self.open_tag_stack.pop() 536 except: 537 self.open_tag = 'open_html' 538 if self.section_state == 'local_contents': 539 if( self.outer_state in [ 'overview', 'nomenclature', 'relationships', 540 'locus', 'genbank', 'external' ] ): 541 if self.detail_state == 'scan_val': 542 self.structure_stack.append( close_list ) 543 elif self.detail_state == 'unpaired_key': 544 self.structure_stack.append( close_list ) 545 elif self.detail_state == 'skip': 546 self.detail_state = 'scan_val' 547 elif self.detail_state == 'scan_headings': 548 self.detail_state = 'scan_val' 549 elif self.outer_state in [ 'additional', ]: 550 if self.detail_state == 'unpaired_key': 551 self.structure_stack.append( close_list ) 552 self.structure_stack.append( close_dict ) 553 self.structure_stack.append( close_list ) 554 elif self.detail_state == 'scan_val': 555 self.structure_stack.append( close_list ) 556 elif self.outer_state in [ 'function', ]: 557 if self.detail_state == 'scan_headings': 558 self.detail_state = 'scan_val' 559 elif self.detail_state == 'unpaired_key': 560 self.detail_state = 'may_be_val' 561 self.structure_stack.append( close_list ) 562 elif self.detail_state == 'scan_val': 563 self.detail_state = 'may_be_val' 564 self.structure_stack.append( close_list ) 565 elif self.outer_state in [ 'refseq', ]: 566 if self.section_state == 'local_contents': 567 if self.detail_state == 'scan_val': 568 self.structure_stack.append( close_list ) 569 self.detail_state = 'may_be_val' 570 elif self.outer_state == 'map': 571 if self.section_state == 'local_contents': 572 if self.detail_state == 'scan_val': 573 self.structure_stack.append( close_list ) 574 self.detail_state = 'may_be_val'
575 576
577 - def start_td( self, attrs ):
578 self.open_tag_stack.append( self.open_tag ) 579 self.open_tag = 'open_table_data' 580 if self.outer_state in [ 'nomenclature', 'overview', 'relationships', 'map', 'locus', 'genbank', 'additional', 'external' ]: 581 if( self.section_state == 'local_contents' ): 582 self.text = '' 583 elif self.outer_state == 'refseq': 584 if self.section_state == 'local_contents': 585 self.text = '' 586 if self.detail_state == 'may_be_val': 587 self.structure_stack.append( open_list ) 588 self.detail_state = 'scan_val'
589
590 - def end_td( self ):
591 try: 592 self.open_tag = self.open_tag_stack.pop() 593 except: 594 self.open_tag = 'open_html' 595 596 597 598 if self.outer_state in [ 'nomenclature', 'overview', 'relationships', 'locus', 'genbank', 'additional', 'external' ]: 599 if( self.section_state == 'local_contents' ): 600 if self.detail_state == 'scan_val': 601 text = self.get_text() 602 if( text != '' ): 603 self.structure_stack.append( text ) 604 elif self.outer_state == 'function': 605 if self.section_state == 'local_contents': 606 text = self.get_text() 607 if( text != '' ): 608 if self.detail_state == 'may_be_val': 609 if text.strip() != '': 610 self.structure_stack.append( open_list ) 611 self.detail_state = 'scan_val' 612 if self.detail_state in [ 'unpaired_key', 'scan_val' ]: 613 self.structure_stack.append( text ) 614 elif self.outer_state == 'map': 615 if self.section_state == 'local_contents': 616 text = self.get_text() 617 if( text != '' ): 618 if self.detail_state == 'may_be_val': 619 if text.strip() != '': 620 self.structure_stack.append( open_list ) 621 self.detail_state = 'scan_val' 622 if self.detail_state == 'scan_val': 623 self.structure_stack.append( text ) 624 elif self.outer_state == 'refseq': 625 if self.section_state == 'local_contents': 626 if self.detail_state == 'scan_val': 627 text = self.get_text() 628 if( text != '' ): 629 self.add_text_to_object( text )
630
631 - def do_br( self, attrs ):
632 if self.outer_state in [ 'nomenclature', 'overview', 'function', 'relationships', 'map', 'locus', 'genbank', 'additional', 'external' ]: 633 if( self.section_state == 'local_contents' ): 634 if self.detail_state == 'scan_val': 635 if self.is_contained_by( 'open_table_data' ): 636 text = self.get_text() 637 if( text != '' ): 638 self.structure_stack.append( text )
639 640
641 - def add_text_to_object( self, text ):
642 stack_item = self.structure_stack.pop() 643 if isinstance( stack_item, Url ): 644 if stack_item.description == '': 645 stack_item.description = text 646 self.structure_stack.append( stack_item ) 647 else: 648 self.structure_stack.append( stack_item ) 649 self.structure_stack.append( text )
650 651 652
653 - def is_contained_by( self, tag ):
654 return tag in self.open_tag_stack
655
    def process_structure_stack( self ):
        """Fold the flat structure stack into nested lists and dicts.

        While more than one item remains, items are popped from the stack
        until an open token is found.  The popped run is turned into a list
        (``process_list``) or a dict (``process_dict``), and the result is
        written back to the stack once the stack is empty or its top is
        another open token.  Runs only when ``outer_state`` names a section.
        """
        params = []
        outer_state = self.outer_state
        if outer_state in [ 'nomenclature', 'overview', 'function', 'relationships', 'refseq', 'locus', 'map', 'genbank', 'additional', 'external' ]:
            while len( self.structure_stack ) > 1:
                len_stack = len( self.structure_stack )
#                self.print_stack()
                # Pop until an open token turns up; popped items land on
                # ``params`` in reverse stack order.
                for i in range ( 0, len_stack ):
                    item = self.structure_stack.pop()
                    if not is_open_token( item ):
                        params.append( item )
                    else: break
                # ``item`` is the open token that ended the run, or the last
                # popped item when the stack held no open token.
                if( open_list.__eq__( item ) ):
                    container = process_list( params )
                    params.append( container )
                else:
                    container = process_dict( params )
                    # Empty dicts are dropped rather than stored.
                    if len( container ) > 0:
                        params.append( container )
                # Move pending items back only when the next enclosing
                # structure starts directly below; otherwise keep them in
                # ``params`` so the next pass folds them into that structure.
                if ( len( self.structure_stack ) == 0 ) or is_open_token(
                    self.structure_stack[ -1 ] ):
                    for j in range( 0, len( params ) ):
                        item = params.pop()
                        self.structure_stack.append( item )
                    params = []
681 682
683 - def print_stack( self ):
684 print '%s!!!!!\n' % self.outer_state.upper() 685 for stack_item in self.structure_stack: 686 print 'stack has ' + str( stack_item ) 687 print '-----------------'
if __name__ == '__main__':
    # Ad-hoc demo: parse a saved LocusLink page and print the record.
    # parse() wraps the handle in an UndoHandle itself, so the raw file
    # handle is passed directly.  The ``with`` block closes the file even
    # if parsing fails.
    with open('Hs13225.htm') as handle:
        record = LocusLinkParser().parse(handle)
    print(record)