
Source Code for Module Bio.NetCatch

# Copyright 2002 by Katharine Lindner.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Code for dealing with lists of URLs (OBSOLETE).

NetCatch enables the user to scan a list of labelled urls and select
a subset to read into a file.

Functions:
get_urls_by_label
get_urls_by_index
get_urls_by_range
select_output_file

This module is now considered to be obsolete, and is likely to be deprecated
in a future release of Biopython, and later removed.
"""
import os
import urllib
import sgmllib
from Bio import File

def is_absolute_url( candidate ):
    ( url_type, url ) = urllib.splittype( candidate )
    if( url_type == None ):
        return 0
    ( url_host, url ) = urllib.splithost( url )
    if( url_host == None ):
        return 0
    return 1
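
# A minimal sketch of what the check above accepts and rejects, as a Python 2
# interpreter session (the example URLs are illustrative, not part of the
# module):
#
#     >>> urllib.splittype( 'http://www.example.org/amylase.html' )
#     ('http', '//www.example.org/amylase.html')
#     >>> is_absolute_url( 'http://www.example.org/amylase.html' )
#     1
#     >>> is_absolute_url( 'amylase.html' )       # no scheme
#     0
#     >>> is_absolute_url( 'http:amylase.html' )  # scheme but no host
#     0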
34 35 """ 36 ExtractUrls.py 37 38 39 Scans a file in http format and builds a dictionary of urls 40 """ 41
class ExtractUrls( sgmllib.SGMLParser ):

    def __init__( self ):
        sgmllib.SGMLParser.__init__( self )
        self.reset()

    def reset( self ):
        sgmllib.SGMLParser.reset( self )
        self.urls = {}
        self._inlink = 0
        self._pending_url = ''
        self.text = ''

    def __str__( self ):
        output = ''
        for key in self.urls.keys():
            val = self.urls[ key ]
            output = output + '%s : %s\n' % ( key, val )
        return output

    def extract_urls( self, handle ):
        self.feed( handle )
        return self.urls

    def feed( self, handle ):
        """feed(self, handle)

        Feed in data for scanning.  handle is a file-like object
        containing html.
        """
        if isinstance( handle, File.UndoHandle ):
            uhandle = handle
        else:
            uhandle = File.UndoHandle( handle )
        text = uhandle.read()
        sgmllib.SGMLParser.feed( self, text )

    def handle_data( self, data ):
        if( self._inlink ):
            self.text = self.text + data

    def start_a( self, attrs ):
        self._inlink = 1
        for key, val in attrs:
            if key.lower() == 'href':
                self._pending_url = val

    def end_a( self ):
        self._inlink = 0
        key = self.text
        self.text = ''
        if not key == '':
            key = key.replace( ' ', '_' )
            self.urls[ key ] = self._pending_url

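# A minimal usage sketch for ExtractUrls (the HTML fragment and the StringIO
# handle are assumptions for illustration; any file-like object works):
#
#     >>> from StringIO import StringIO
#     >>> html = '<a href="http://www.example.org/pig.html">pig amylase</a>'
#     >>> parser = ExtractUrls()
#     >>> parser.extract_urls( StringIO( html ) )
#     {'pig_amylase': 'http://www.example.org/pig.html'}
#
# The link text becomes the label, with spaces mapped to underscores; an
# anchor with empty text is dropped.
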
class Url:

    def __init__( self, label, url ):
        assert is_absolute_url( url )
        assert type( label ) == type( '' )
        self.label = label
        self.url = url

class NetCatch:
    """
    Decorator for a dictionary of links. Each link is indexed by its label.
    Allows the user to select links of interest and read each selection into
    its own file. The filename is constructed by appending the label with an
    extension of '.htm'.

    Files can be selected by index, range or label. The destination directory
    defaults to the current directory. The user can specify another
    directory by passing a list of path segments to the constructor.

    net_catch = NetCatch()
    net_catch = NetCatch( [ 'amylase', 'species' ] )
    net_catch.get_all_urls()
    net_catch.get_urls_by_label( [ 'pig', 'dog', 'cow' ] )
    net_catch.get_urls_by_index( [ 1, 4, 6, 9 ] )
    net_catch.get_urls_by_range( 2, 5 )
    """

    def __init__( self, path_segments = None ):
        # None rather than [] as the default avoids sharing one mutable list
        # across instances; an omitted argument means no extra path segments.
        if path_segments is None:
            path_segments = []
        self._urls = {}
        self._labels = []
        assert type( path_segments ) == type( [] )
        self.path_segments = path_segments
        self._build_path()

    def _build_path( self ):
        base_path = os.path.join( '' )
        for segment in self.path_segments:
            base_path = os.path.join( base_path, segment )
        self.base_path = base_path
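
    # For example (illustrative values): NetCatch( [ 'amylase', 'species' ] )
    # builds a base_path of os.path.join( 'amylase', 'species' ), i.e.
    # 'amylase/species' on POSIX, and the fetch methods below write their
    # output files beneath it. The directory is not created automatically.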

    def __str__( self ):
        i = 0
        output = ''
        for label in self._labels:
            output = output + '%d %s: %s\n' % ( i, label, self._urls[ label ] )
            i = i + 1
        return output

    def import_dict( self, href_dict ):
        for ( key, val ) in href_dict.items():
            self.add_url( key, val )

    def add_url( self, label, url ):
        assert is_absolute_url( url )
        assert type( label ) == type( '' )
        self._labels.append( label )
        self._urls[ label ] = url
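
    # Sketch of populating a NetCatch from harvested links (labels and URLs
    # are illustrative; 'parser' and 'handle' are the ExtractUrls instance and
    # file-like object sketched above):
    #
    #     >>> net_catch = NetCatch()
    #     >>> net_catch.import_dict( parser.extract_urls( handle ) )
    #     >>> net_catch.add_url( 'cow', 'http://www.example.org/cow.html' )
    #     >>> print net_catch
    #     0 pig_amylase: http://www.example.org/pig.html
    #     1 cow: http://www.example.org/cow.html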

    def get_all_urls( self ):
        url_opener = urllib.URLopener()
        i = 0
        for label in self._labels:
            base_path = self.base_path
            name = '%s%d.htm' % ( label, i )
            full_path = os.path.join( base_path, name )
            out_handle = open( full_path, "wb" )
            i = i + 1
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_opener.close()
            out_handle.close()

    def get_urls_by_label( self, labels ):
        url_opener = urllib.URLopener()
        for label in labels:
            base_path = self.base_path
            name = '%s.htm' % ( label )
            full_path = os.path.join( base_path, name )
            out_handle = open( full_path, "wb" )
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_opener.close()
            out_handle.close()

    def get_urls_by_index( self, indices ):
        url_opener = urllib.URLopener()
        for index in indices:
            base_path = self.base_path
            name = '%s.htm' % self._labels[ index ]
            full_path = os.path.join( base_path, name )
            out_handle = open( full_path, "wb" )
            label = self._labels[ index ]
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_opener.close()
            out_handle.close()

    def get_urls_by_range( self, low, hi ):
        url_opener = urllib.URLopener()
        for index in range( low, hi ):
            base_path = self.base_path
            name = '%s.htm' % self._labels[ index ]
            full_path = os.path.join( base_path, name )
            out_handle = open( full_path, "wb" )
            label = self._labels[ index ]
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_opener.close()
            out_handle.close()
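
# End-to-end sketch (assumes the example URLs are reachable and that the
# base_path directory, here amylase/species, already exists; each fetch is
# saved as '<label>.htm' beneath it):
#
#     >>> net_catch = NetCatch( [ 'amylase', 'species' ] )
#     >>> net_catch.add_url( 'pig', 'http://www.example.org/pig.html' )
#     >>> net_catch.add_url( 'cow', 'http://www.example.org/cow.html' )
#     >>> net_catch.get_urls_by_label( [ 'pig' ] )   # writes pig.htm
#     >>> net_catch.get_urls_by_index( [ 1 ] )       # writes cow.htm
#     >>> net_catch.get_urls_by_range( 0, 2 )        # writes both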