1
2
3
4
5
6 """
7 This module provides code to work with PubMed from the NCBI (DEPRECATED).
8
9 This module has been deprecated and is likely to be removed in a future
10 release of Biopython. Please use Bio.Entrez instead, which is described
11 in the Biopython Tutorial.
12
13 See also:
14 http://www.ncbi.nlm.nih.gov/PubMed/
15
16 Online documentation for linking to PubMed is available at:
17 http://www.ncbi.nlm.nih.gov/PubMed/linking.html
18
19
20 Classes:
21 Dictionary Access PubMed articles using a dictionary interface.
22
23 Functions:
24 search_for Search PubMed.
25 find_related Find related articles in PubMed.
26 download_many Download many articles from PubMed in batch mode.
27
28 """
29
30 import warnings
# Importing this module is itself deprecated usage, so the warning is issued
# unconditionally at import time.
warnings.warn(
    "Bio.PubMed has been deprecated, and we intend to remove it in"
    " a future release of Biopython. Please use Bio.Entrez"
    " instead as described in the Tutorial. If you need help"
    " with this transition, or wish to continue to use this code,"
    " please get in contact via the mailing lists.",
    DeprecationWarning)
37
38 import re
39 import sgmllib
40
41 from Bio import File
42 from Bio import Entrez
43 from Bio import Medline
44
46 """Access PubMed using a read-only dictionary interface (DEPRECATED).
47
48 Please use the Bio.Entrez.efetch(...) function instead as described in the
49 Biopython Tutorial.
50 """
52 """Dictionary(parser=None)
53
54 Create a new Dictionary to access PubMed. parser is an optional
55 parser (e.g. Medline.RecordParser) object to change the results
56 into another form. If set to None, then the raw contents of the
57 file will be returned.
58
59 """
60 self.parser = parser
61
63 raise NotImplementedError("PubMed contains lots of entries")
65 raise NotImplementedError("This is a read-only dictionary")
67 raise NotImplementedError("This is a read-only dictionary")
69 raise NotImplementedError("This is a read-only dictionary")
71 raise NotImplementedError("You don't need to do this...")
73 raise NotImplementedError("You don't really want to do this...")
75 raise NotImplementedError("You don't really want to do this...")
77 raise NotImplementedError("You don't really want to do this...")
78
80 """S.has_key(id) -> bool"""
81 try:
82 self[id]
83 except KeyError:
84 return 0
85 return 1
86
def get(self, id, failobj=None):
    """S.get(id[, failobj]) -> object

    Return self[id]. If that lookup raises a KeyError, return failobj
    (None by default) instead of propagating the error.

    """
    try:
        entry = self[id]
    except KeyError:
        entry = failobj
    return entry
92
94 """S.__getitem__(id) -> object
95
96 Return the Medline entry. id is either the Medline Unique ID
97 or the Pubmed ID of the article. Raises a KeyError if there's an
98 error.
99
100 """
101 try:
102 handle = Entrez.efetch(
103 db="pubmed", id=id, retmode='text', rettype='medlars')
104 except IOError, x:
105
106
107
108 raise KeyError(x)
109 if self.parser is not None:
110 return self.parser.parse(handle)
111 return handle.read()
112
def search_for(search, reldate=None, mindate=None, maxdate=None,
               batchsize=100, callback_fn=None, start_id=0, max_ids=None):
    """Search PubMed, returns a list of IDs (DEPRECATED).

    Please use Bio.Entrez instead as described in the Biopython Tutorial.

    Search PubMed and return a list of the PMID's that match the
    criteria.

    Arguments:
    search       The search string, sent to NCBI as the 'term' parameter.
    reldate      Number of days prior to the current date to restrict
                 the search.
    mindate      Earliest date to restrict the search, e.g. 2002/01/01.
    maxdate      Latest date to restrict the search, e.g. 2002/01/01.
    batchsize    Number of ids to request per call to Entrez.esearch.
                 Defaults to 100 and must be at least 1.
    callback_fn  Optional function called with each PMID, batch by batch,
                 as results are retrieved.
    start_id     Index of the first id to retrieve.
    max_ids      Maximum number of ids to retrieve. None (the default)
                 keeps requesting batches until one comes back shorter
                 than requested.

    Raises a ValueError if batchsize is smaller than 1.

    XXX The date parameters don't seem to be working with NCBI's
    script.  Please let me know if you can get it to work.

    """
    # A batch size below 1 can never produce a "short" batch, so with
    # max_ids=None the loop below would never terminate.
    if batchsize < 1:
        raise ValueError("batchsize must be at least 1")

    # The date restrictions are handed to Entrez.esearch exactly as given,
    # including None when the caller did not set them.
    params = {
        'db': 'pubmed',
        'term': search,
        'reldate': reldate,
        'mindate': mindate,
        'maxdate': maxdate,
    }

    ids = []
    while max_ids is None or len(ids) < max_ids:
        # Never ask for more than the caller's remaining quota.
        wanted = batchsize
        if max_ids is not None:
            wanted = min(wanted, max_ids - len(ids))

        params['retstart'] = start_id + len(ids)
        params['retmax'] = wanted
        handle = Entrez.esearch(**params)
        try:
            record = Entrez.read(handle)
        finally:
            # Release the connection even if parsing the reply fails.
            handle.close()

        idlist = record["IdList"]
        ids.extend(idlist)
        if callback_fn is not None:
            for pmid in idlist:
                callback_fn(pmid)

        # A batch shorter than requested means the results are exhausted.
        if len(idlist) < wanted:
            break
    return ids
163
def start_id(self, attributes):
    """Set the in_id flag to 1; handle_data only collects while it is set.

    attributes is accepted but not used. Presumably called by the
    enclosing parser when an id element opens (class header not visible
    here - confirm).

    """
    self.in_id = 1
def end_id(self):
    """Reset the in_id flag to 0 so handle_data stops collecting.

    Presumably called by the enclosing parser when an id element closes
    (class header not visible here - confirm).

    """
    self.in_id = 0
def start_link(self, attributes):
    """Set the in_link flag to 1; handle_data only collects while it is set.

    attributes is accepted but not used. Presumably called by the
    enclosing parser when a link element opens (class header not visible
    here - confirm).

    """
    self.in_link = 1
def end_link(self):
    """Reset the in_link flag to 0 so handle_data stops collecting.

    Presumably called by the enclosing parser when a link element closes
    (class header not visible here - confirm).

    """
    self.in_link = 0
207 _not_pmid_re = re.compile(r'\D')
def handle_data(self, data):
    """Collect data as an ID while both the in_link and in_id flags are set.

    Data seen while either flag is unset is ignored. Otherwise the data
    must not match self._not_pmid_re (any non-digit character); a match
    raises a ValueError, and clean data is appended to self.ids.

    """
    if self.in_link and self.in_id:
        # Anything other than digits cannot be an ID, so fail loudly
        # instead of storing a bogus value.
        if self._not_pmid_re.search(data):
            message = "I expected an ID, but '%s' doesn't look like one." % \
                      repr(data)
            raise ValueError(message)
        self.ids.append(data)
220
221 parser = ResultParser()
222 if type(pmid) is type([]):
223 pmid = ','.join(pmid)
224 h = Entrez.elink(dbfrom='pubmed', id=pmid)
225 parser.feed(h.read())
226 return parser.ids
227
def download_many(ids, callback_fn, broken_fn=None,
                  batchsize=500, parser=None):
    """Download multiple PubMed records, no return value (DEPRECATED).

    Please use Bio.Entrez instead as described in the Biopython Tutorial.

    Download many records from PubMed in batches, shrinking the batch
    when a download fails and growing it again after repeated successes.

    Arguments:
    ids          The Medline Unique IDs or PubMed IDs of the articles, as
                 strings. The sequence passed in is never modified.
    callback_fn  Called as callback_fn(id, record) for every record that
                 is downloaded.
    broken_fn    Optional function called with the id of each record that
                 could not be downloaded.
    batchsize    Number of records to request each time; must be between
                 1 and 500.
    parser       Optional parser handed to Medline.Iterator when the
                 downloaded records are iterated for callback_fn.

    Raises a ValueError if batchsize is outside the range 1 to 500.

    """
    if batchsize > 500 or batchsize < 1:
        raise ValueError("batchsize must be between 1 and 500")

    # Work on a private copy: broken ids are popped off the front below,
    # and that must not alter the caller's list.
    pending = list(ids)
    current_batchsize = batchsize
    nsuccesses = 0
    while pending:
        if current_batchsize > len(pending):
            current_batchsize = len(pending)

        id_str = ','.join(pending[:current_batchsize])

        try:
            handle = Entrez.efetch(
                db="pubmed", id=id_str, retmode='text', rettype='medlars')
            try:
                results = handle.read()
            finally:
                # Release the connection whether or not the read worked.
                handle.close()
            # Count the records that came back; a mismatch is treated
            # exactly like a failed download.
            num_ids = sum(
                1 for _ in Medline.Iterator(File.StringHandle(results)))
            if num_ids != current_batchsize:
                raise IOError
        except IOError:
            if current_batchsize == 1:
                # A batch of one that still fails pins down the broken
                # id: report it and move on.
                broken_id = pending.pop(0)
                if broken_fn is not None:
                    broken_fn(broken_id)
            else:
                # Halve the batch to narrow down the offending id. Floor
                # division keeps the size an integer for slicing.
                current_batchsize = current_batchsize // 2
            nsuccesses = 0
            continue
        nsuccesses = nsuccesses + 1

        # Re-read the buffered text, this time with the caller's parser,
        # pairing each record with the id at the same position.
        records = Medline.Iterator(File.StringHandle(results), parser)
        for idnum, rec in enumerate(records):
            callback_fn(pending[idnum], rec)

        pending = pending[current_batchsize:]

        # After two successes in a row at a reduced size, double the
        # batch again, capped at the requested batchsize.
        if nsuccesses >= 2 and current_batchsize < batchsize:
            current_batchsize = current_batchsize * 2
            if current_batchsize > batchsize:
                current_batchsize = batchsize
314