1
2
3
4
5
6 """
7 This module provides code to work with PubMed from the NCBI.
8 http://www.ncbi.nlm.nih.gov/PubMed/
9
10 Online documentation for linking to PubMed is available at:
11 http://www.ncbi.nlm.nih.gov/PubMed/linking.html
12
13
14 Classes:
15 Dictionary Access PubMed articles using a dictionary interface.
16
17 Functions:
18 search_for Search PubMed.
19 find_related Find related articles in PubMed.
20 download_many Download many articles from PubMed in batch mode.
21
22 """
23
24 import re
25 import sgmllib
26
27 from Bio import File
28 from Bio import Entrez
29 from Bio import Medline
30
32 """Access PubMed using a read-only dictionary interface.
33
34 Methods:
35
36 """
38 """Dictionary(parser=None)
39
40 Create a new Dictionary to access PubMed. parser is an optional
41 parser (e.g. Medline.RecordParser) object to change the results
42 into another form. If set to None, then the raw contents of the
43 file will be returned.
44
45 """
46 self.parser = parser
47
49 raise NotImplementedError, "PubMed contains lots of entries"
51 raise NotImplementedError, "This is a read-only dictionary"
53 raise NotImplementedError, "This is a read-only dictionary"
55 raise NotImplementedError, "This is a read-only dictionary"
57 raise NotImplementedError, "You don't need to do this..."
59 raise NotImplementedError, "You don't really want to do this..."
61 raise NotImplementedError, "You don't really want to do this..."
63 raise NotImplementedError, "You don't really want to do this..."
64
66 """S.has_key(id) -> bool"""
67 try:
68 self[id]
69 except KeyError:
70 return 0
71 return 1
72
def get(self, id, failobj=None):
    """S.get(id[, failobj]) -> object

    Return self[id].  If looking up id raises a KeyError, return
    failobj (None by default) instead.

    """
    try:
        return self[id]
    except KeyError:
        return failobj
79
81 """S.__getitem__(id) -> object
82
83 Return the Medline entry. id is either the Medline Unique ID
84 or the Pubmed ID of the article. Raises a KeyError if there's an
85 error.
86
87 """
88 try:
89 handle = Entrez.efetch(
90 db="pubmed", id=id, retmode='text', rettype='medlars')
91 except IOError, x:
92
93
94
95 raise KeyError, x
96 if self.parser is not None:
97 return self.parser.parse(handle)
98 return handle.read()
99
def search_for(search, reldate=None, mindate=None, maxdate=None,
               batchsize=100, callback_fn=None, start_id=0, max_ids=None):
    """search_for(search[, reldate][, mindate][, maxdate]
    [, batchsize][, callback_fn][, start_id][, max_ids]) -> ids

    Search PubMed and return a list of the PMIDs that match the
    criteria.  search is the search string used to search the
    database.  reldate is the number of days prior to the current
    date to restrict the search.  mindate and maxdate are the dates to
    restrict the search, e.g. 2002/01/01.  batchsize specifies the
    number of ids to request at one time.  By default, it is set to
    100.  callback_fn is an optional callback function that will be
    called with each PMID as results are retrieved.  start_id
    specifies the index of the first id to retrieve and max_ids
    specifies the maximum number of ids to retrieve.

    Raises a ValueError if the text of an Id element in the reply
    contains a non-digit character.

    XXX The date parameters don't seem to be working with NCBI's
    script.  Please let me know if you can get it to work.

    """
    class ResultParser(sgmllib.SGMLParser):
        # Collects the text found inside <Id> elements of the reply.
        _not_pmid_re = re.compile(r'\D')

        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self.ids = []
            self.in_id = 0

        def start_id(self, attributes):
            self.in_id = 1

        def end_id(self):
            self.in_id = 0

        def handle_data(self, data):
            if not self.in_id:
                return
            # Ignore whitespace-only text between tags.
            data = data.strip()
            if not data:
                return
            # Anything that is not purely digits cannot be a PMID.
            if self._not_pmid_re.search(data):
                raise ValueError(
                    "I expected an ID, but %s doesn't look like one."
                    % repr(data))
            self.ids.append(data)

    candidates = {
        'db': 'pubmed',
        'term': search,
        'reldate': reldate,
        'mindate': mindate,
        'maxdate': maxdate,
    }
    # Build a new dict without the unset parameters instead of deleting
    # keys from a dict while it is being iterated.
    params = dict([(key, value) for key, value in candidates.items()
                   if value is not None])

    ids = []
    while max_ids is None or len(ids) < max_ids:
        parser = ResultParser()

        # Never request more than is still needed to reach max_ids.
        retmax = batchsize
        if max_ids is not None:
            retmax = min(retmax, max_ids - len(ids))

        params['retstart'] = start_id + len(ids)
        params['retmax'] = retmax
        handle = Entrez.esearch(**params)
        try:
            parser.feed(handle.read())
        finally:
            handle.close()
        ids.extend(parser.ids)
        if callback_fn is not None:
            for pmid in parser.ids:
                callback_fn(pmid)
        # A short (or empty) batch means there is nothing more to fetch.
        if len(parser.ids) < retmax or not parser.ids:
            break
    return ids
184
207 def start_id(self, attributes):
208 self.in_id = 1
209 def end_id(self):
210 self.in_id = 0
211 def start_link(self, attributes):
212 self.in_link = 1
213 def end_link(self):
214 self.in_link = 0
215 _not_pmid_re = re.compile(r'\D')
216 def handle_data(self, data):
217 if not self.in_link or not self.in_id:
218 return
219
220
221
222
223 if self._not_pmid_re.search(data):
224 raise ValueError, \
225 "I expected an ID, but '%s' doesn't look like one." % \
226 repr(data)
227 self.ids.append(data)
228
229 parser = ResultParser()
230 if type(pmid) is type([]):
231 pmid = ','.join(pmid)
232 h = Entrez.elink(dbfrom='pubmed', id=pmid)
233 parser.feed(h.read())
234 return parser.ids
235
def download_many(ids, callback_fn, broken_fn=None,
                  batchsize=500, parser=None):
    """download_many(ids, callback_fn[, broken_fn][, batchsize][, parser])

    Download many records from PubMed.  ids is a list of either the
    Medline Unique ID or the PubMed ID's of the articles.  Each time a
    record is downloaded, callback_fn is called with the id and the
    record.  broken_fn is an optional function that is called with the
    id of records that were not able to be downloaded.  batchsize is the
    number of records to request each time; it must be between 1 and
    500, otherwise a ValueError is raised.  parser is an optional
    parser object that is handed to Medline.Iterator when the records
    are read.

    The list passed in as ids is not modified.

    """
    if batchsize > 500 or batchsize < 1:
        raise ValueError("batchsize must be between 1 and 500")
    # Work on a private copy: the loop below consumes the list, and the
    # caller's list must not change underneath them.
    ids = list(ids)
    current_batchsize = batchsize

    # Number of consecutive batches that downloaded cleanly.
    nsuccesses = 0
    while ids:
        if current_batchsize > len(ids):
            current_batchsize = len(ids)

        id_str = ','.join(ids[:current_batchsize])

        try:
            handle = Entrez.efetch(
                db="pubmed", id=id_str, retmode='text', rettype='medlars')
            results = handle.read()
            # Count the records in the reply; a mismatch with the
            # number requested is treated like a failed download.
            num_ids = 0
            for _ in Medline.Iterator(File.StringHandle(results)):
                num_ids += 1
            if num_ids != current_batchsize:
                raise IOError
            handle = File.StringHandle(results)
        except IOError:
            if current_batchsize == 1:
                # Only one id was requested, so that id is the broken one.
                broken_id = ids.pop(0)
                if broken_fn is not None:
                    broken_fn(broken_id)
            else:
                # Cannot tell which id is broken; retry with a smaller
                # batch.  Floor division keeps the batch size an integer.
                current_batchsize = current_batchsize // 2
            nsuccesses = 0
            continue
        nsuccesses += 1

        # Records are paired with the requested ids by position.
        for idnum, rec in enumerate(Medline.Iterator(handle, parser)):
            callback_fn(ids[idnum], rec)

        ids = ids[current_batchsize:]

        # After two clean batches in a row, grow the batch size back
        # towards the requested batchsize.
        if nsuccesses >= 2 and current_batchsize < batchsize:
            current_batchsize = current_batchsize * 2
            if current_batchsize > batchsize:
                current_batchsize = batchsize
320