6 """Code for dealing with lists of URLs (OBSOLETE).
7
8 NetCatch enables the user to scan a list of labelled urls and select
9 a subset to read into a file.
10
11 Functions:
12 get_urls_by_label
13 get_urls_by_index
14 get_urls_by_range
15 select_output_file
16
17 This module is now considered to be obsolete, and is likely to be deprecated
18 in a future release of Biopython, and later removed.
19 """
import os
import urllib
import sgmllib
from Bio import File


def is_absolute_url( candidate ):
    ( url_type, url ) = urllib.splittype( candidate )
    if( url_type == None ):
        return 0
    ( url_host, url ) = urllib.splithost( url )
    if( url_host == None ):
        return 0
    return 1
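# A quick sketch of the intended behaviour (the urls below are purely
# illustrative):
#
#     is_absolute_url( 'http://www.example.org/index.html' )  returns 1
#     is_absolute_url( 'docs/index.html' )  returns 0, since there is no
#     scheme or host to split off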

"""
ExtractUrls.py


Scans a file in html format and builds a dictionary of urls
"""

class ExtractUrls( sgmllib.SGMLParser ):

    def __init__( self ):
        sgmllib.SGMLParser.__init__( self )
        self.reset()

    def reset( self ):
        sgmllib.SGMLParser.reset( self )
        self.urls = {}
        self._inlink = 0
        self._pending_url = ''
        self.text = ''

    def __str__( self ):
        output = ''
        for key in self.urls.keys():
            val = self.urls[ key ]
            output = output + '%s : %s\n' % ( key, val )
        return output

    def extract_urls( self, handle ):
        self.feed( handle )
        return self.urls

    def feed( self, handle ):
        """Feed in data for scanning.

        handle is a file-like object containing html.
        """
        if isinstance( handle, File.UndoHandle ):
            uhandle = handle
        else:
            uhandle = File.UndoHandle( handle )
        text = uhandle.read()
        sgmllib.SGMLParser.feed( self, text )

    def handle_data( self, data ):
        if self._inlink:
            self.text = self.text + data

    def start_a( self, attrs ):
        self._inlink = 1
        for key, val in attrs:
            if key.lower() == 'href':
                self._pending_url = val

    def end_a( self ):
        self._inlink = 0
        key = self.text
        self.text = ''
        if not key == '':
            key = key.replace( ' ', '_' )
            self.urls[ key ] = self._pending_url

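# A minimal usage sketch for ExtractUrls, assuming 'links.html' names a local
# file of html (the filename is illustrative only):
#
#     parser = ExtractUrls()
#     urls = parser.extract_urls( open( 'links.html' ) )
#     print parser            # one 'label : url' line per link found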

class NetCatch:
107 """
108 Decorator for a dictionary of links. Each link is indexed by its label.
109 Allows the user to select links of interest and read each selection into
110 its own file. The filename is contructed by appending the label with an
111 extension of html.
112
113 Files can be selected by index, range or label. The destination directory
114 defaults to the current directory. The user can specify another
115 dictionary by passing a list of path segments to the constructor.
116
117 net_catch = NetCatch()
118 net_catch = NetCatch( [ 'amylase', 'species' ] )
119 net_catch.get_all_urls()
120 net_catch.get_urls_by_label( [ 'pig', 'dog', 'cow' ] )
121 net_catch.get_urls_by_index( [ 1, 4, 6, 9 ] )
122 net_catch.get_urls_by_range( 2, 5 )
123 """

    def __init__( self, path_segments = [] ):
        self._urls = {}
        self._labels = []
        assert type( path_segments ) == type( [] )
        self.path_segments = path_segments
        self._build_path()

    def _build_path( self ):
        base_path = os.path.join( '' )
        for segment in self.path_segments:
            base_path = os.path.join( base_path, segment )
        self.base_path = base_path
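        # For example, path_segments of [ 'amylase', 'species' ] gives a
        # base_path of 'amylase/species' on a posix style system, while the
        # default empty list leaves base_path as '' (the current directory).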

    def __str__( self ):
        i = 0
        output = ''
        for label in self._labels:
            output = output + '%d %s: %s\n' % ( i, label, self._urls[ label ] )
            i = i + 1
        return output
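    # str( net_catch ) therefore lists one '<index> <label>: <url>' line per
    # entry, which is a convenient way to look up the indices expected by
    # get_urls_by_index and get_urls_by_range.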

    def import_dict( self, href_dict ):
        for ( key, val ) in href_dict.items():
            self.add_url( key, val )

    def add_url( self, label, url ):
        assert is_absolute_url( url )
        assert type( label ) == type( '' )
        self._labels.append( label )
        self._urls[ label ] = url

    def get_all_urls( self ):
        url_opener = urllib.URLopener()
        i = 0
        for label in self._labels:
            base_path = self.base_path
            name = '%s%d.htm' % ( label, i )
            full_path = os.path.join( base_path, name )
            out_handle = open( full_path, "wb" )
            i = i + 1
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_opener.close()
            out_handle.close()

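    # Note the naming difference: get_all_urls appends a running index to the
    # label ('pig0.htm', 'dog1.htm', ...), while the selection methods below
    # use the label alone ('pig.htm').  The labels here are just the examples
    # from the class docstring.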
    def get_urls_by_label( self, labels ):
        url_opener = urllib.URLopener()
        for label in labels:
            base_path = self.base_path
            name = '%s.htm' % label
            full_path = os.path.join( base_path, name )
            out_handle = open( full_path, "wb" )
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_opener.close()
            out_handle.close()

    def get_urls_by_index( self, indices ):
        url_opener = urllib.URLopener()
        for index in indices:
            base_path = self.base_path
            label = self._labels[ index ]
            name = '%s.htm' % label
            full_path = os.path.join( base_path, name )
            out_handle = open( full_path, "wb" )
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_opener.close()
            out_handle.close()

    def get_urls_by_range( self, low, hi ):
        url_opener = urllib.URLopener()
        for index in range( low, hi ):
            base_path = self.base_path
            label = self._labels[ index ]
            name = '%s.htm' % label
            full_path = os.path.join( base_path, name )
            out_handle = open( full_path, "wb" )
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_opener.close()
            out_handle.close()

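# A rough end to end sketch, assuming 'links.html' is a local html file whose
# labelled links are absolute urls, and that a 'downloads' directory already
# exists (all of the names here are illustrative):
#
#     parser = ExtractUrls()
#     urls = parser.extract_urls( open( 'links.html' ) )
#     net_catch = NetCatch( [ 'downloads' ] )
#     net_catch.import_dict( urls )
#     print net_catch                       # list labels with their indices
#     net_catch.get_urls_by_range( 0, 2 )   # fetch the first two into downloads/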