1
2
3
4
5
6 """
7 This module provides code to work with Medline.
8
9 Classes:
10 Record A dictionary holding Medline data.
11
12 Functions:
13 read Reads one Medline record
14 parse Allows you to iterate over a bunch of Medline records
15
16 Deprecated classes:
17 Iterator Iterates over a file containing Medline records.
18 RecordParser Parses a Medline record into a Record object.
19
20 _Scanner Scans a Medline record.
21 _RecordConsumer Consumes Medline data to a Record object.
22
23 """
24
26 """A dictionary holding information from a Medline record.
27 All data are stored under the mnemonic appearing in the Medline
28 file. These mnemonics have the following interpretations:
29
30 Mnemonic Description
31 AB Abstract
32 CI Copyright Information
33 AD Affiliation
34 IRAD Investigator Affiliation
35 AID Article Identifier
36 AU Author
37 FAU Full Author
38 CN Corporate Author
39 DCOM Date Completed
40 DA Date Created
41 LR Date Last Revised
42 DEP Date of Electronic Publication
43 DP Date of Publication
44 EDAT Entrez Date
45 GS Gene Symbol
46 GN General Note
47 GR Grant Number
48 IR Investigator Name
49 FIR Full Investigator Name
50 IS ISSN
51 IP Issue
52 TA Journal Title Abbreviation
53 JT Journal Title
54 LA Language
55 LID Location Identifier
56 MID Manuscript Identifier
57 MHDA MeSH Date
58 MH MeSH Terms
59 JID NLM Unique ID
60 RF Number of References
61 OAB Other Abstract
62 OCI Other Copyright Information
63 OID Other ID
64 OT Other Term
65 OTO Other Term Owner
66 OWN Owner
67 PG Pagination
68 PS Personal Name as Subject
69 FPS Full Personal Name as Subject
70 PL Place of Publication
71 PHST Publication History Status
72 PST Publication Status
73 PT Publication Type
74 PUBM Publishing Model
75 PMC PubMed Central Identifier
76 PMID PubMed Unique Identifier
77 RN Registry Number/EC Number
78 NM Substance Name
79 SI Secondary Source ID
80 SO Source
81 SFM Space Flight Mission
82 STAT Status
83 SB Subset
84 TI Title
85 TT Transliterated Title
86 VI Volume
87 CON Comment on
88 CIN Comment in
89 EIN Erratum in
90 EFR Erratum for
91 CRI Corrected and Republished in
92 CRF Corrected and Republished from
93 PRIN Partial retraction in
94 PROF Partial retraction of
95 RPI Republished in
96 RPF Republished from
97 RIN Retraction in
98 ROF Retraction of
99 UIN Update in
100 UOF Update of
101 SPIN Summary for patients in
102 ORI Original report in
103 """
160
161
def parse(handle):
    """Read Medline records one by one from the handle.

    The handle is either a Medline file, a file-like object, or a list
    of lines describing one or more Medline records.

    This is a generator. Records are separated by blank lines; each
    record is yielded as a Record mapping the field mnemonic to its
    value. Fields named in ``textkeys`` are joined into a single string;
    every other field is left as a list of strings, one entry per line
    on which the field (or its continuation) appeared.

    Raises ValueError if a record starts with a continuation line, since
    there is no field for the continuation to belong to.

    Typical usage:

        from Bio import Medline
        handle = open("mymedlinefile")
        records = Medline.parse(handle)
        for record in records:
            print(record['TI'])

    """
    # Fields whose collected lines are merged into one string.
    textkeys = ("ID", "PMID", "SO", "RF", "NI", "JC", "TA", "IS", "CY", "TT",
                "CA", "IP", "VI", "DP", "YR", "PG", "LID", "DA", "LR", "OWN",
                "STAT", "DCOM", "PUBM", "DEP", "PL", "JID", "SB", "PMC",
                "EDAT", "MHDA", "PST", "AB", "AD", "EA", "TI", "JT")
    handle = iter(handle)

    # Skip leading blank lines; input with no content yields nothing.
    for line in handle:
        line = line.rstrip()
        if line:
            break
    else:
        return

    record = Record()
    key = None  # field currently being filled; None at the start of a record
    finished = False
    while not finished:
        # A continuation line is indented by six spaces, the same width
        # as the "XXXX- " prefix that is sliced off field lines below.
        if line[:6] == "      ":
            if key is None:
                raise ValueError(
                    "Continuation line without a preceding field:\n%s" % line)
            record[key].append(line[6:])
        elif line:
            key = line[:4].rstrip()
            if key not in record:
                record[key] = []
            record[key].append(line[6:])
        try:
            # The builtin next() works on both Python 2.6+ and Python 3.
            line = next(handle)
        except StopIteration:
            finished = True
        else:
            line = line.rstrip()
            if line:
                continue

        # Reached a blank line or the end of the input: the record is done.
        for textkey in textkeys:
            if textkey in record:
                record[textkey] = " ".join(record[textkey])
        if record:
            yield record
        record = Record()
        key = None
215
def read(handle):
    """Read a single Medline record from the handle.

    The handle is either a Medline file, a file-like object, or a list
    of lines describing a Medline record.

    Only the first record produced by parse() is returned. If the handle
    contains no record, the StopIteration raised by the exhausted
    parser propagates to the caller.

    Typical usage:

        from Bio import Medline
        handle = open("mymedlinefile")
        record = Medline.read(handle)
        print(record['TI'])

    """
    records = parse(handle)
    # The builtin next() works on both Python 2.6+ and Python 3.
    return next(records)
232
233
234
235 from Bio import File
236 from Bio.ParserSupport import *
237
239 """Returns one record at a time from a file of Medline records.
240
241 Methods:
242 next Return the next record from the stream, or None.
243
244 """
def __init__(self, handle, parser=None):
    """Create a new iterator.

    handle is a file-like object. parser is an optional Parser object
    to change the results into another form. If set to None, then the
    raw contents of the file will be returned.

    Emits a DeprecationWarning on every construction.
    """
    import warnings
    # stacklevel=2 attributes the warning to the code constructing the
    # iterator rather than to this line.
    warnings.warn("Bio.Medline.Iterator is deprecated. Instead of Bio.Medline.Iterator(handle, Bio.Medline.RecordParser()), please use Bio.Medline.parse(handle)", DeprecationWarning, stacklevel=2)
    self._handle = handle
    self._parser = parser
257
260
def next(self):
    """Return the next Medline record from the file.

    Lines are read from the handle up to and including the blank line
    that ends a record. The collected text is passed through the parser
    given at construction time, or returned as a raw string when no
    parser was given.

    Raises StopIteration when the handle has no more lines.
    """
    lines = []
    for line in self._handle:
        lines.append(line)
        if line.strip() == '':
            break

    # Stop only when nothing at all was read. A final record that is not
    # followed by a blank line must still be returned, not discarded.
    if not lines:
        raise StopIteration

    data = ''.join(lines)

    if self._parser is not None:
        return self._parser.parse_str(data)
    return data
281
283 """Parses Medline data into a Record object.
284
285 """
287 import warnings
288 warnings.warn("Bio.Medline.RecordParser is deprecated. Instead of Bio.Medline.RecordParser().parse(handle)), please use Bio.Medline.read(handle)", DeprecationWarning)
289 self._scanner = _Scanner()
290 self._consumer = _RecordConsumer()
291
def parse(self, handle):
    """Scan handle and return the data collected by the consumer.

    The scanner is fed the handle together with this parser's consumer;
    whatever the consumer holds in its ``data`` attribute afterwards is
    returned.
    """
    self._scanner.feed(handle, self._consumer)
    return self._consumer.data
295
297 """Scans a Medline record.
298
299 """
300
301 _categories = {
302 "AA" : "abstract_author",
303 "AB" : "abstract",
304 "AD" : "address",
305 "AU" : "author",
306 "CA" : "call_number",
307 "CM" : "comments",
308 "CU" : "class_update_date",
309 "CY" : "country",
310 "DA" : "entry_date",
311 "DP" : "publication_date",
312 "EA" : "english_abstract",
313 "EM" : "entry_month",
314 "GS" : "gene_symbol",
315 "ID" : "identification",
316 "IP" : "issue_part_supplement",
317 "IS" : "issn",
318 "JC" : "journal_title_code",
319 "LA" : "language",
320 "LI" : "special_list",
321 "LR" : "last_revision_date",
322 "MH" : "mesh_heading",
323 "MN" : "mesh_tree_number",
324 "MR" : "major_revision_date",
325 "NI" : "no_author",
326 "NM" : "substance_name",
327 "PG" : "pagination",
328 "PS" : "personal_name_as_subject",
329 "PT" : "publication_type",
330 "RF" : "number_of_references",
331 "RN" : "cas_registry_number",
332 "RO" : "record_originator",
333 "SB" : "journal_subset",
334 "SH" : "subheadings",
335 "SI" : "secondary_source_id",
336 "SO" : "source",
337 "TA" : "title_abbreviation",
338 "TI" : "title",
339 "TT" : "transliterated_title",
340 "UI" : "unique_identifier",
341 "VI" : "volume_issue",
342 "YR" : "year",
343
344
345 "PMID" : "pubmed_id",
346 }
347
348 - def feed(self, handle, consumer):
365
367 consumer.start_record()
368
369 prev_qualifier = None
370 while 1:
371 line = uhandle.readline()
372 if is_blank_line(line):
373 break
374
375
376
377
378
379
380
381
382 qualifier = line[:4].rstrip()
383
384
385
386
387 if line[0] == '\t' or qualifier == '' or \
388 line[:13] == ' purification':
389 if prev_qualifier is None:
390 raise ValueError("Continuation on first line\n%s" % line)
391 qualifier = prev_qualifier
392 else:
393
394 if len(line) < 5 or line[4] != '-':
395 raise ValueError(\
396 "I don't understand the format of line %s" % line)
397 prev_qualifier = qualifier
398
399 try:
400 fn = getattr(consumer, self._categories[qualifier])
401 except KeyError:
402
403 consumer.undefined(line)
404 else:
405 fn(line)
406
407 consumer.end_record()
408
410 """Consumer that converts a Medline record to a Record object.
411
412 Members:
413 data Record with Medline data.
414
415 """
418
421
424
427
430
433
436
440
443
448
452
def entry_date(self, line):
    """Store the cleaned content of an entry date ("DA") line.

    Raises AssertionError if an entry date has already been recorded.
    """
    assert not self.data.entry_date, "entry date already defined"
    self.data.entry_date = self._clean(line)
456
461
466
def entry_month(self, line):
    """Store the cleaned content of an entry month ("EM") line.

    Raises AssertionError if an entry month has already been recorded.
    """
    assert not self.data.entry_month, \
           "entry month already defined"
    self.data.entry_month = self._clean(line)
471
474
477
482
483 - def issn(self, line):
486
491
494
498
503
505
506
507
508
509 if line[:2] == 'MH':
510 self.data.mesh_headings.append(self._clean(line))
511 else:
512 prev_mh = self.data.mesh_headings.pop()
513 continued_mh = self._clean(line)
514 self.data.mesh_headings.append("%s %s" % (prev_mh, continued_mh))
515
518
523
527
531
533 assert not self.data.pagination, "pagination already defined"
534 self.data.pagination = self._clean(line)
535
538
541
546
549
552
555
558
561
564
568
571
575
579
583
584 - def year(self, line):
587
591
601
def _clean(self, line, rstrip=1):
    """Return the content of a Medline line with its field prefix removed.

    The prefix is dropped according to the first rule that applies:
      - if the line contains a tab, everything up to and including the
        first tab is removed;
      - if the line starts with ' purification', only the leading space
        is removed;
      - otherwise the first six characters are removed.

    When rstrip is true (the default) trailing whitespace is removed
    from the result as well.
    """
    tab = line.find('\t')
    if tab >= 0:
        content = line[tab + 1:]
    elif line[:13] == ' purification':
        content = line[1:]
    else:
        content = line[6:]
    if rstrip:
        return content.rstrip()
    return content
613
614 _needs_stripping = [
615 'abstract', 'source', 'address', 'title_abbreviation',
616 'title', 'transliterated_title'
617 ]
623