Package Bio :: Package Medline
[hide private]
[frames | no frames]

Source Code for Package Bio.Medline

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provides code to work with Medline. 
  8   
  9  Classes: 
 10  Record           Holds Medline data. 
 11  Iterator         Iterates over a file containing Medline records. 
 12  RecordParser     Parses a Medline record into a Record object. 
 13   
 14  _Scanner         Scans a Medline record. 
 15  _RecordConsumer  Consumes Medline data to a Record object. 
 16   
 17  """ 
 18   
 19  from Bio import File 
 20  from Bio.ParserSupport import * 
 21   
class Record:
    """Container for the fields of one Medline record.

    A new Record is completely empty: every single-valued member is an
    empty string and every multi-valued member is its own empty list.

    Members:
    id                      Medline ID.
    pubmed_id               Pubmed ID.

    mesh_headings           List of MeSH headings.
    mesh_tree_numbers       List of MeSH Tree Numbers.
    mesh_subheadings        List of MeSH subheadings.

    abstract                The abstract.
    comments                List of references to comments.
    abstract_author         The author of the abstract.
    english_abstract        "A" if a foreign article has an english abstract.

    source                  Bibliographic information.
    publication_types       List of type of publication.
    number_of_references    Number of bibliographic references, for REVIEW
                            pubs.

    authors                 List of authors.
    no_author               "A" for anonymous.
    address                 Address of the first author.

    journal_title_code      Three-character code assigned to the journal.
    title_abbreviation      Abbreviation of journal title.
    issn                    International Standard Serial Number.
    journal_subsets         List of strings that describe journal groupings.
    country                 Country of publication.
    languages               List of languages of the article.

    title                   Article title.
    transliterated_title    Title in the original language.
    call_number             The call number of the journal issue.
    issue_part_supplement   Issue, part, or supplement of journal published.
    volume_issue            Volume number of journal.
    publication_date        Date published (string).
    year                    Year published (string).
    pagination              Inclusive pages of an indexed item.

    special_list            Coding for the database of the citation.

    substance_name          Preferred name for a chemical or drug.
    gene_symbols            List of abbreviated gene names.
    secondary_source_ids    List of source databanks and accessions.
    identifications         List of research grant or contract numbers.
    registry_numbers        List of CAS or EC numbers.

    personal_name_as_subjects  List of individuals who are subjects.

    record_originators      List of people who worked on record.
    entry_date              Date record made machine readable (YYMMDD).
    entry_month             YYMM entered into Medline.
    class_update_date       Date touched by Class Maintenance action
                            (string).
    last_revision_date      Date for minor revision.
    major_revision_date     Date for major revision.

    undefined               List of lines that don't match the standard.

    """
    def __init__(self):
        # Strings are immutable, so several members may share one '' via
        # chained assignment.  Each list member gets its own [] literal so
        # that no two members (or two Records) ever share a list.

        # Identifiers.
        self.id = self.pubmed_id = ''

        # MeSH indexing.
        self.mesh_headings, self.mesh_tree_numbers, self.mesh_subheadings = \
            [], [], []

        # Abstract.
        self.abstract = ''
        self.comments = []
        self.abstract_author = self.english_abstract = ''

        # Bibliographic source.
        self.source = ''
        self.publication_types = []
        self.number_of_references = ''

        # Authorship.
        self.authors = []
        self.no_author = self.address = ''

        # Journal.
        self.journal_title_code = self.title_abbreviation = self.issn = ''
        self.journal_subsets = []
        self.country = ''
        self.languages = []

        # Article.
        self.title = self.transliterated_title = self.call_number = ''
        self.issue_part_supplement = self.volume_issue = ''
        self.publication_date = self.year = self.pagination = ''

        self.special_list = ''

        # Substances, genes and cross references.
        self.substance_name = ''
        self.gene_symbols, self.secondary_source_ids = [], []
        self.identifications, self.registry_numbers = [], []

        self.personal_name_as_subjects = []

        # Record bookkeeping.
        self.record_originators = []
        self.entry_date = self.entry_month = self.class_update_date = ''
        self.last_revision_date = self.major_revision_date = ''

        # Lines whose qualifier is not part of the standard.
        self.undefined = []
137
class Iterator:
    """Returns one record at a time from a file of Medline records.

    Records are read line by line; a record ends at the first line that
    is empty after stripping whitespace (that line is kept as part of the
    record) or at the end of the file.

    Methods:
    next   Return the next record from the stream.  Raises StopIteration
           when there is nothing left to read.

    """
    def __init__(self, handle, parser=None):
        """__init__(self, handle, parser=None)

        Create a new iterator.  handle is a file-like object.  parser
        is an optional Parser object to change the results into another form.
        If set to None, then the raw contents of the file will be returned.

        """
        self._handle = handle
        self._parser = parser

    def __iter__(self):
        return self

    def next(self):
        """next(self) -> object

        Return the next medline record from the file.  The record is the
        raw text, or the result of parser.parse_str(text) when a parser
        was given.  Raises StopIteration if there are no more records.

        """
        lines = []
        # The handle is stored as self._handle by __init__.
        for line in self._handle:
            lines.append(line)
            if line.strip() == '':
                break

        # Only stop when nothing at all was read.  A final record that is
        # not followed by a blank line must still be returned.
        if not lines:
            raise StopIteration

        data = ''.join(lines)

        if self._parser is not None:
            return self._parser.parse_str(data)
        return data

    # Python 3 spells the iterator protocol method __next__.
    __next__ = next
179
class RecordParser(AbstractParser):
    """Turns the text of a Medline record into a Record object.

    """
    def __init__(self):
        # The scanner reads the lines and fires events; the consumer
        # receives those events and builds the Record.
        self._consumer = _RecordConsumer()
        self._scanner = _Scanner()

    def parse(self, handle):
        """parse(self, handle) -> Record

        Scan the Medline record available from handle (a file-like
        object) and return the Record the consumer built from it.

        """
        consumer = self._consumer
        self._scanner.feed(handle, consumer)
        return consumer.data
191
class _Scanner:
    """Scans a Medline record.

    Each line of the record is passed to the consumer method named by
    the line's qualifier (see _categories).  Lines with an unknown
    qualifier go to consumer.undefined.

    """
    # map the category qualifier to an event
    _categories = {
        "AA": "abstract_author",
        "AB": "abstract",
        "AD": "address",
        "AU": "author",
        "CA": "call_number",
        "CM": "comments",
        "CU": "class_update_date",
        "CY": "country",
        "DA": "entry_date",
        "DP": "publication_date",
        "EA": "english_abstract",
        "EM": "entry_month",
        "GS": "gene_symbol",
        "ID": "identification",
        "IP": "issue_part_supplement",
        "IS": "issn",
        "JC": "journal_title_code",
        "LA": "language",
        "LI": "special_list",
        "LR": "last_revision_date",
        "MH": "mesh_heading",
        "MN": "mesh_tree_number",
        "MR": "major_revision_date",
        "NI": "no_author",
        "NM": "substance_name",
        "PG": "pagination",
        "PS": "personal_name_as_subject",
        "PT": "publication_type",
        "RF": "number_of_references",
        "RN": "cas_registry_number",
        "RO": "record_originator",
        "SB": "journal_subset",
        "SH": "subheadings",
        "SI": "secondary_source_id",
        "SO": "source",
        "TA": "title_abbreviation",
        "TI": "title",
        "TT": "transliterated_title",
        "UI": "unique_identifier",
        "VI": "volume_issue",
        "YR": "year",

        # Not documented.
        "PMID": "pubmed_id",
        }

    def feed(self, handle, consumer):
        """feed(self, handle, consumer)

        Feed in a Medline unit record for scanning.  handle is a file-like
        object that contains a Medline record.  consumer is a
        Consumer object that will receive events as the report is scanned.

        """
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)

        # Read the Entrez header information, if it exists
        if attempt_read_and_call(uhandle, consumer.noevent, start='Entrez'):
            read_and_call(uhandle, consumer.noevent, start='----------------')
        self._scan_record(uhandle, consumer)

    def _scan_record(self, uhandle, consumer):
        """_scan_record(self, uhandle, consumer)

        Read lines from uhandle up to the first blank line (or the end of
        the file) and dispatch each one to the consumer, bracketed by
        consumer.start_record() and consumer.end_record().

        Raises ValueError if the first line is a continuation line, or if
        a non-continuation line has no '-' in its fifth column.

        """
        consumer.start_record()

        prev_qualifier = None
        while True:
            line = uhandle.readline()
            # readline() returns '' at the end of the file.  Stop there
            # explicitly so that line[0] below is never applied to an
            # empty string, whatever is_blank_line makes of it.
            if not line or is_blank_line(line):
                break

            # There are 2 possible formats for a line:
            # TI  - Epidemiology of mycobacterial resistance (especially Mycoba
            #       tuberculosis).
            # 1) qualifier + '-' + data
            # 2) continuation, with just data

            # Check to see if it's a continuation line.
            qualifier = line[:4].rstrip()
            # There's a bug in some MH lines where the "isolation &
            # purification" subheading gets split across lines, leaving
            # "purification" at the beginning of the next line preceded
            # by only 1 space.  Treat such a line as a continuation too.
            if line[0] == '\t' or qualifier == '' or \
               line[:13] == ' purification':
                if prev_qualifier is None:
                    raise ValueError("Continuation on first line\n%s" % line)
                qualifier = prev_qualifier
            else:
                # Make sure it contains a '-'
                if len(line) < 5 or line[4] != '-':
                    raise ValueError(
                        "I don't understand the format of line %s" % line)
                prev_qualifier = qualifier

            try:
                fn = getattr(consumer, self._categories[qualifier])
            except KeyError:
                # call the 'undefined' function for qualifiers that are
                # not listed in _categories
                consumer.undefined(line)
            else:
                fn(line)

        consumer.end_record()
304
class _RecordConsumer(AbstractConsumer):
    """Builds a Record object from the events of a scanned Medline record.

    Every event method receives the complete line from the file; the
    qualifier prefix is removed by _clean before the text is stored.

    Members:
    data    The Record being filled in.  None until start_record is called.

    """
    def __init__(self):
        self.data = None

    def start_record(self):
        # Every record starts from a fresh, empty Record.
        self.data = Record()

    def end_record(self):
        self._clean_record(self.data)

    # Storage helpers shared by the event methods below.

    def _assign_once(self, attr, line, message):
        # Store a single-valued field; a second value for the same field
        # trips the assertion with the given message.
        assert not getattr(self.data, attr), message
        setattr(self.data, attr, self._clean(line))

    def _collect(self, attr, line):
        # Add the cleaned text to a list-valued field.
        getattr(self.data, attr).append(self._clean(line))

    def _accumulate(self, attr, line):
        # Append the text to a field that may span several lines.  Trailing
        # whitespace is kept here and removed once by _clean_record.
        so_far = getattr(self.data, attr)
        setattr(self.data, attr, so_far + self._clean(line, rstrip=0))

    # Event methods, one per entry in _Scanner._categories.

    def abstract_author(self, line):
        # Not guarded: a later AA line simply replaces an earlier one.
        self.data.abstract_author = self._clean(line)

    def abstract(self, line):
        self._accumulate('abstract', line)

    def address(self, line):
        self._accumulate('address', line)

    def author(self, line):
        self._collect('authors', line)

    def call_number(self, line):
        self._assign_once('call_number', line, "call numbers already defined")

    def comments(self, line):
        self._collect('comments', line)

    def class_update_date(self, line):
        self._assign_once('class_update_date', line,
                          "class update date already defined")

    def country(self, line):
        self._assign_once('country', line, "country already defined")

    def entry_date(self, line):
        self._assign_once('entry_date', line, "entry date already defined")

    def publication_date(self, line):
        self._assign_once('publication_date', line,
                          "publication date already defined")

    def english_abstract(self, line):
        self._assign_once('english_abstract', line,
                          "english abstract already defined")

    def entry_month(self, line):
        self._assign_once('entry_month', line, "entry month already defined")

    def gene_symbol(self, line):
        self._collect('gene_symbols', line)

    def identification(self, line):
        self._collect('identifications', line)

    def issue_part_supplement(self, line):
        self._assign_once('issue_part_supplement', line,
                          "issue/part/supplement already defined")

    def issn(self, line):
        self._assign_once('issn', line, "ISSN already defined")

    def journal_title_code(self, line):
        self._assign_once('journal_title_code', line,
                          "journal title code already defined")

    def language(self, line):
        self._collect('languages', line)

    def special_list(self, line):
        self._assign_once('special_list', line, "special list already defined")

    def last_revision_date(self, line):
        self._assign_once('last_revision_date', line,
                          "last revision date already defined")

    def mesh_heading(self, line):
        # A line that does not start with 'MH' continues the previous
        # heading, so it is joined onto that heading with a single space
        # instead of becoming a heading of its own.
        # See PMID 12107064 for an example, found by Dan Rubin.
        text = self._clean(line)
        headings = self.data.mesh_headings
        if line[:2] != 'MH':
            text = "%s %s" % (headings.pop(), text)
        headings.append(text)

    def mesh_tree_number(self, line):
        self._collect('mesh_tree_numbers', line)

    def major_revision_date(self, line):
        self._assign_once('major_revision_date', line,
                          "major revision date already defined")

    def no_author(self, line):
        self._assign_once('no_author', line, "no author already defined")

    def substance_name(self, line):
        self._assign_once('substance_name', line,
                          "substance name already defined")

    def pagination(self, line):
        self._assign_once('pagination', line, "pagination already defined")

    def personal_name_as_subject(self, line):
        self._collect('personal_name_as_subjects', line)

    def publication_type(self, line):
        self._collect('publication_types', line)

    def number_of_references(self, line):
        self._assign_once('number_of_references', line,
                          "num of references already defined")

    def cas_registry_number(self, line):
        self._collect('registry_numbers', line)

    def record_originator(self, line):
        self._collect('record_originators', line)

    def journal_subset(self, line):
        self._collect('journal_subsets', line)

    def subheadings(self, line):
        self._collect('mesh_subheadings', line)

    def secondary_source_id(self, line):
        self._collect('secondary_source_ids', line)

    def source(self, line):
        self._accumulate('source', line)

    def title_abbreviation(self, line):
        self._accumulate('title_abbreviation', line)

    def title(self, line):
        self._accumulate('title', line)

    def transliterated_title(self, line):
        self._accumulate('transliterated_title', line)

    def unique_identifier(self, line):
        self._assign_once('id', line, "id already defined")

    def volume_issue(self, line):
        self._assign_once('volume_issue', line, "volume issue already defined")

    def year(self, line):
        self._assign_once('year', line, "year already defined")

    def pubmed_id(self, line):
        self._assign_once('pubmed_id', line, "PMID already defined")

    def undefined(self, line):
        # Records sometimes contain lines with qualifiers that don't match
        # any in the standard.  Those lines are kept, unmodified, in a
        # separate list.
        # Some undefined qualifiers:
        # 4098, 4099, 4100, 4101
        # 634
        # NP, PID, EDAT, MHDA
        self.data.undefined.append(line)

    def _clean(self, line, rstrip=1):
        """_clean(self, line, rstrip=1) -> string

        Return the data part of line with the qualifier prefix removed.
        If the line contains a tab, everything up to and including the
        first tab is dropped.  Otherwise a line starting with
        ' purification' loses only its leading space, and any other line
        loses its first 6 characters.  Trailing whitespace is removed
        unless rstrip is false.

        """
        tab_at = line.find('\t')
        if tab_at >= 0:
            text = line[tab_at + 1:]
        elif line[:13] == ' purification':
            text = line[1:]
        else:
            text = line[6:]
        if not rstrip:
            return text
        return text.rstrip()

    # Members built up with rstrip=0; their trailing whitespace is removed
    # once, when the record is complete.
    _needs_stripping = [
        'abstract', 'source', 'address', 'title_abbreviation',
        'title', 'transliterated_title'
        ]

    def _clean_record(self, rec):
        # Remove trailing newlines
        for name in self._needs_stripping:
            setattr(rec, name, getattr(rec, name).rstrip())
519