Package Bio :: Module NetCatch

Source Code for Module Bio.NetCatch

# Copyright 2002 by Katharine Lindner.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Code for dealing with lists of URLs (DEPRECATED).

This module is now deprecated, and will be removed in a future release of
Biopython.

NetCatch enables the user to scan a list of labelled urls and select
a subset to read into a file.

Functions:
get_urls_by_label
get_urls_by_index
get_urls_by_range
select_output_file
"""

import warnings
warnings.warn("Bio.NetCatch is deprecated, and will be removed in a future"
              " release of Biopython.  If you want to continue to use this"
              " code, please get in contact with the Biopython developers"
              " via the mailing lists to avoid its permanent removal from"
              " Biopython.", DeprecationWarning)
import os
import urllib
import sgmllib
from Bio import File


def is_absolute_url( candidate ):
    ( url_type, url ) = urllib.splittype( candidate )
    if( url_type == None ):
        return 0
    ( url_host, url ) = urllib.splithost( url )
    if( url_host == None ):
        return 0
    return 1
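
As a quick illustration (not part of the original module source, and assuming
Python 2, where urllib.splittype and urllib.splithost are available),
is_absolute_url returns 1 only when a URL carries both a scheme and a host:

    from Bio.NetCatch import is_absolute_url

    print is_absolute_url( 'http://www.example.org/enzyme.html' )   # 1
    print is_absolute_url( 'enzyme.html' )                          # 0 (no scheme)
    print is_absolute_url( 'mailto:someone@example.org' )           # 0 (no host)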

"""
ExtractUrls.py


Scans a file in http format and builds a dictionary of urls
"""

class ExtractUrls( sgmllib.SGMLParser ):

    def __init__( self ):
        sgmllib.SGMLParser.__init__( self )
        self.reset()

    def reset( self ):
        sgmllib.SGMLParser.reset( self )
        self.urls = {}
        self._inlink = 0
        self._pending_url = ''
        self.text = ''

    def __str__( self ):
        output = ''
        for key in self.urls.keys():
            val = self.urls[ key ]
            output = output + '%s : %s\n' % ( key, val )
        return output

    def extract_urls(self, handle):
        self.feed(handle)
        return self.urls

    def feed(self, handle):
        """feed(self, handle )

        Feed in data for scanning.  handle is a file-like object
        containing html.

        """
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)
        text = uhandle.read()
        sgmllib.SGMLParser.feed( self, text )

    def handle_data(self, data):
        if( self._inlink ):
            self.text = self.text + data

    def start_a( self, attrs ):
        self._inlink = 1
        for key, val in attrs:
            if key.lower() == 'href':
                self._pending_url = val

    def end_a( self ):
        self._inlink = 0
        key = self.text
        self.text = ''
        if not key == '':
            key = key.replace( ' ', '_' )
            self.urls[ key ] = self._pending_url

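A minimal sketch (not part of the module source; Python 2 only, since sgmllib
was removed in Python 3) of driving ExtractUrls by hand: the parser maps each
anchor's link text, with spaces replaced by underscores, to its href value.

    from StringIO import StringIO
    from Bio.NetCatch import ExtractUrls

    html = '<a href="http://www.example.org/pig.html">pig amylase</a>'
    parser = ExtractUrls()
    urls = parser.extract_urls( StringIO( html ) )
    print urls   # {'pig_amylase': 'http://www.example.org/pig.html'}
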
class Url:

    def __init__( self, label, url ):
        assert is_absolute_url( url )
        assert type( label ) == type( '' )
        self.label = label
        self.url = url

class NetCatch:
    """
    Decorator for a dictionary of links.  Each link is indexed by its label.
    Allows the user to select links of interest and read each selection into
    its own file.  The filename is constructed by appending the label with an
    extension of .htm.

    Files can be selected by index, range or label.  The destination directory
    defaults to the current directory.  The user can specify another
    directory by passing a list of path segments to the constructor.

    net_catch = NetCatch()
    net_catch = NetCatch( [ 'amylase', 'species' ] )
    net_catch.get_all_urls()
    net_catch.get_urls_by_label( [ 'pig', 'dog', 'cow' ] )
    net_catch.get_urls_by_index( [ 1, 4, 6, 9 ] )
    net_catch.get_urls_by_range( 2, 5 )
    """

    def __init__( self, path_segments = [] ):
        self._urls = {}
        self._labels = []
        assert type( path_segments ) == type( [] )
        self.path_segments = path_segments
        self._build_path()

    def _build_path( self ):
        base_path = os.path.join( '' )
        for segment in self.path_segments:
            base_path = os.path.join( base_path, segment )
        self.base_path = base_path

    def __str__( self ):
        i = 0
        output = ''
        for label in self._labels:
            output = output + '%d %s: %s\n' % ( i, label, self._urls[ label ] )
            i = i + 1
        return output

    def import_dict( self, href_dict ):
        for ( key, val ) in href_dict.items():
            self.add_url( key, val )

    def add_url( self, label, url ):
        assert is_absolute_url( url )
        assert type( label ) == type( '' )
        self._labels.append( label )
        self._urls[ label ] = url

    def get_all_urls( self ):
        url_opener = urllib.URLopener()
        i = 0
        for label in self._labels:
            base_path = self.base_path
            name = '%s%d.htm' % ( label, i )
            full_path = os.path.join( base_path, name )
            out_handle = open( full_path, "wb" )
            i = i + 1
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_opener.close()
            out_handle.close()

    def get_urls_by_label( self, labels ):
        url_opener = urllib.URLopener()
        for label in labels:
            base_path = self.base_path
            name = '%s.htm' % ( label )
            full_path = os.path.join( base_path, name )
            out_handle = open( full_path, "wb" )
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_opener.close()
            out_handle.close()

    def get_urls_by_index( self, indices ):
        url_opener = urllib.URLopener()
        for index in indices:
            base_path = self.base_path
            name = '%s.htm' % self._labels[ index ]
            full_path = os.path.join( base_path, name )
            out_handle = open( full_path, "wb" )
            label = self._labels[ index ]
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_opener.close()
            out_handle.close()

    def get_urls_by_range( self, low, hi ):
        url_opener = urllib.URLopener()
        for index in range( low, hi ):
            base_path = self.base_path
            name = '%s.htm' % self._labels[ index ]
            full_path = os.path.join( base_path, name )
            out_handle = open( full_path, "wb" )
            label = self._labels[ index ]
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_opener.close()
            out_handle.close()

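Finally, a hedged end-to-end sketch (assumed usage under Python 2; the page
address and labels are placeholders, not taken from the Biopython
documentation) that chains ExtractUrls and NetCatch: links harvested from an
HTML index page are imported by label and then written out as individual
.htm files in the current directory.

    import urllib
    from Bio.NetCatch import ExtractUrls, NetCatch

    # Harvest {label: absolute url} pairs from a hypothetical index page.
    handle = urllib.urlopen( 'http://www.example.org/amylase/index.html' )
    links = ExtractUrls().extract_urls( handle )

    net_catch = NetCatch()          # destination defaults to the current directory
    net_catch.import_dict( links )  # add_url() asserts every url is absolute
    print net_catch                 # numbered listing of labels and urls
    net_catch.get_urls_by_range( 0, 2 )   # fetch the first two links to <label>.htm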