Package Bio :: Package SeqIO :: Module StockholmIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.StockholmIO

  1  # Copyright 2006, 2007 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Bio.SeqIO support for the "stockholm" (aka PFAM) file format. 
  8   
  9  You were expected to use this module via the Bio.SeqIO functions. 
 10  This module has now been replaced by Bio.AlignIO.PhylipIO, and is 
 11  deprecated.""" 
 12   
 13  import warnings 
 14  warnings.warn("Bio.SeqIO.StockholmIO is deprecated.  You can continue to read" \ 
 15                + " and write 'clustal' files with Bio.SeqIO, but this is now" \ 
 16                + " handled via Bio.AlignIO internally.", 
 17                DeprecationWarning) 
 18   
 19  from Bio.Alphabet import single_letter_alphabet 
 20  from Bio.Seq import Seq 
 21  from Bio.SeqRecord import SeqRecord 
 22  from Interfaces import InterlacedSequenceIterator, SequentialSequenceWriter 
 23   
24 -class StockholmIterator(InterlacedSequenceIterator) :
25 """Loads a Stockholm file from PFAM into SeqRecord objects. 26 27 The entire file is loaded, and any sequence can be accessed using 28 the [index] notation. 29 30 This parser will detect if the Stockholm file follows the PFAM conventions 31 for sequence specific meta-data (lines starting #=GS and #=GR) and 32 populates the SeqRecord fields accordingly. 33 34 Any annotation which does not follow the PFAM conventions is currently 35 ignored. 36 37 If an accession is provided for an entry in the meta data, IT WILL NOT 38 be used as the record.id (it will be recorded in the record's annotations). 39 This is because some files have (sub) sequences from different parts of 40 the same accession (differentiated by different start-end positions). 41 42 Wrap-around alignments are not supported - each sequences must be on 43 a single line. However, interlaced sequences should work. 44 45 For more information on the file format, please see: 46 http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format 47 http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html 48 49 For consistency with BioPerl and EMBOSS we call this the "stockholm" 50 format. 51 """ 52 53 #These dictionaries should be kept in sync with those 54 #defined in the PfamStockholmWriter class. 55 pfam_gr_mapping = {"SS" : "secondary_structure", 56 "SA" : "surface_accessibility", 57 "TM" : "transmembrane", 58 "PP" : "posterior_probability", 59 "LI" : "ligand_binding", 60 "AS" : "active_site", 61 "IN" : "intron"} 62 #Following dictionary deliberately does not cover AC, DE or DR 63 pfam_gs_mapping = {"OS" : "organism", 64 "OC" : "organism_classification", 65 "LO" : "look"} 66 67 #TODO - Should the default be Gapped(single_letter_alphabet) instead?
68 - def __init__(self, handle, alphabet = single_letter_alphabet) :
69 """Create a StockholmIterator object (which returns SeqRecord objects). 70 71 handle - input file 72 alphabet - optional alphabet 73 """ 74 self.handle = handle 75 self.alphabet = alphabet 76 77 self.sequences = {} 78 self.ids = [] 79 self.ParseAlignment() 80 self.move_start()
81
82 - def ParseAlignment(self):
83 line = self.handle.readline() 84 if not line: 85 #Empty file - just give up. 86 return 87 if not line.strip() == '# STOCKHOLM 1.0': 88 raise ValueError("Did not find STOCKHOLM header") 89 #import sys 90 #print >> sys.stderr, 'Warning file does not start with STOCKHOLM 1.0' 91 92 # Note: If this file follows the PFAM conventions, there should be 93 # a line containing the number of sequences, e.g. "#=GF SQ 67" 94 # We do not check for this - perhaps we should, and verify that 95 # if present it agrees with our parsing. 96 97 seqs = {} 98 ids = [] 99 gs = {} 100 gr = {} 101 passed_end_alignment = False 102 while 1: 103 line = self.handle.readline() 104 if not line: break #end of file 105 line = line.strip() #remove trailing \n 106 if line == "//" : 107 #The "//" line indicates the end of the alignment. 108 #There may still be more meta-data 109 passed_end_alignment = True 110 elif line == "" : 111 #blank line, ignore 112 pass 113 elif line[0] <> "#" : 114 #Sequence 115 #Format: "<seqname> <sequence>" 116 assert not passed_end_alignment 117 parts = [x.strip() for x in line.split(" ",1)] 118 if len(parts) <> 2 : 119 #This might be someone attempting to store a zero length sequence? 120 raise ValueError("Could not split line into identifier " \ 121 + "and sequence:\n" + line) 122 id, seq = parts 123 if id not in ids : 124 ids.append(id) 125 seqs.setdefault(id, '') 126 seqs[id] += seq.replace(".","-") 127 elif len(line) >= 5 : 128 #Comment line or meta-data 129 if line[:5] == "#=GF " : 130 #Generic per-File annotation, free text 131 #Format: #=GF <feature> <free text> 132 pass 133 elif line[:5] == '#=GC ': 134 #Generic per-Column annotation, exactly 1 char per column 135 #Format: "#=GC <feature> <exactly 1 char per column>" 136 pass 137 elif line[:5] == '#=GS ': 138 #Generic per-Sequence annotation, free text 139 #Format: "#=GS <seqname> <feature> <free text>" 140 id, feature, text = line[5:].strip().split(None,2) 141 #if id not in ids : 142 # ids.append(id) 143 if id not in gs : 144 gs[id] = {} 145 if feature not in gs[id] : 146 gs[id][feature] = [text] 147 else : 148 gs[id][feature].append(text) 149 elif line[:5] == "#=GR " : 150 #Generic per-Sequence AND per-Column markup 151 #Format: "#=GR <seqname> <feature> <exactly 1 char per column>" 152 id, feature, text = line[5:].strip().split(None,2) 153 #if id not in ids : 154 # ids.append(id) 155 if id not in gr : 156 gr[id] = {} 157 if feature not in gr[id]: 158 gr[id][feature] = "" 159 gr[id][feature] += text.strip() # append to any previous entry 160 #TODO - Should we check the length matches the alignment length? 161 # For iterlaced sequences the GR data can be split over 162 # multiple lines 163 #Next line... 164 165 166 assert len(seqs) <= len(ids) 167 #assert len(gs) <= len(ids) 168 #assert len(gr) <= len(ids) 169 170 #This is some paranoia based on some pathelogical test cases. 171 if ids and seqs : 172 align_len = None 173 for id, seq in seqs.iteritems() : 174 assert id in ids 175 if align_len is None : 176 align_len = len(seq) 177 elif align_len <> len(seq) : 178 raise ValueError("Sequences have different lengths, or repeated identifier") 179 180 181 self.ids = ids 182 self.sequences = seqs 183 self.seq_annotation = gs 184 self.seq_col_annotation = gr 185 self.move_start()
186
187 - def __len__(self) :
188 return len(self.ids)
189
190 - def _identifier_split(self, identifier) :
191 """Returns (name,start,end) string tuple from an identier""" 192 if identifier.find("/")<>-1 : 193 start_end = identifier.split("/",1)[1] 194 if start_end.count("-")==1 : 195 start, end = start_end.split("-") 196 name = identifier.split("/",1)[0] 197 return (name, start, end) 198 return (identifier, None, None)
199
200 - def _get_meta_data(self, identifier, meta_dict) :
201 """Takes an itentifier and returns dict of all meta-data matching it. 202 203 For example, given "Q9PN73_CAMJE/149-220" will return all matches to 204 this or "Q9PN73_CAMJE" which the identifier without its /start-end 205 suffix. 206 207 In the example below, the suffix is required to match the AC, but must 208 be removed to match the OS and OC meta-data. 209 210 # STOCKHOLM 1.0 211 #=GS Q9PN73_CAMJE/149-220 AC Q9PN73 212 ... 213 Q9PN73_CAMJE/149-220 NKA... 214 ... 215 #=GS Q9PN73_CAMJE OS Campylobacter jejuni 216 #=GS Q9PN73_CAMJE OC Bacteria 217 218 This function will return an empty dictionary if no data is found""" 219 name, start, end = self._identifier_split(identifier) 220 if name==identifier : 221 identifier_keys = [identifier] 222 else : 223 identifier_keys = [identifier, name] 224 answer = {} 225 for identifier_key in identifier_keys : 226 try : 227 for feature_key in meta_dict[identifier_key] : 228 answer[feature_key] = meta_dict[identifier_key][feature_key] 229 except KeyError : 230 pass 231 return answer
232
233 - def _populate_meta_data(self, identifier, record) :
234 """Adds meta-date to a SecRecord's annotations dictionary. 235 236 This function applies the PFAM conventions.""" 237 238 seq_data = self._get_meta_data(identifier, self.seq_annotation) 239 for feature in seq_data : 240 #Note this dictionary contains lists! 241 if feature=="AC" : #ACcession number 242 assert len(seq_data[feature])==1 243 record.annotations["accession"]=seq_data[feature][0] 244 elif feature=="DE" : #DEscription 245 record.description = "\n".join(seq_data[feature]) 246 elif feature=="DR" : #Database Reference 247 #Should we try and parse the strings? 248 record.dbxrefs = seq_data[feature] 249 elif feature in self.pfam_gs_mapping : 250 record.annotations[self.pfam_gs_mapping[feature]] = ", ".join(seq_data[feature]) 251 else : 252 #Ignore it? 253 record.annotations["GS:" + feature] = ", ".join(seq_data[feature]) 254 255 seq_col_data = self._get_meta_data(identifier, self.seq_col_annotation) 256 for feature in seq_col_data : 257 #Note this dictionary contains strings! 258 if feature in self.pfam_gr_mapping : 259 record.annotations[self.pfam_gr_mapping[feature]] = seq_col_data[feature] 260 else : 261 #Ignore it? 262 record.annotations["GR:" + feature] = seq_col_data[feature]
263
264 - def __getitem__(self, i):
265 """Provides random access to the SeqRecords.""" 266 if i < 0 or i >= len(self.ids) : raise ValueError 267 id = self.ids[i] 268 seq_len = len(self.sequences[id]) 269 270 name, start, end = self._identifier_split(id) 271 272 record = SeqRecord(Seq(self.sequences[id], self.alphabet), id=id, name=name, description=id) 273 274 if start : record.annotations["start"] = int(start) 275 if end : record.annotations["end"] = int(end) 276 277 #will be overridden by _populate_meta_data if an explicit accession is provided: 278 record.annotations["accession"]=name 279 280 self._populate_meta_data(id, record) 281 282 #DO NOT TOUCH self._n 283 return record
284
285 -class StockholmWriter(SequentialSequenceWriter):
286 """Class to write PFAM style Stockholm format files. 287 288 Note that sequences and their annotation are recorded 289 together (rather than having a block of annotation followed 290 by a block of aligned sequences). 291 """ 292 293 #These dictionaries should be kept in sync with those 294 #defined in the PfamStockholmIterator class. 295 pfam_gr_mapping = { "secondary_structure" : "SS", 296 "surface_accessibility" : "SA", 297 "transmembrane" : "TM", 298 "posterior_probability" : "PP", 299 "ligand_binding" : "LI", 300 "active_site" : "AS", 301 "intron" : "IN"} 302 #Following dictionary deliberately does not cover AC, DE or DR 303 pfam_gs_mapping = {"organism" : "OS", 304 "organism_classification" : "OC", 305 "look" : "LO"} 306
307 - def __init__(self, handle):
308 """Creates the writer object. 309 310 Use the method write_file() to actually record your sequence records.""" 311 SequentialSequenceWriter.__init__(self, handle) 312 self._ids_written = [] 313 self._length_of_sequences = None
314
315 - def write_header(self, count):
316 """Must supply the number of records (count).""" 317 SequentialSequenceWriter.write_header(self) # sets flags 318 self.handle.write("# STOCKHOLM 1.0\n") 319 self.handle.write("#=GF SQ %i\n" % count)
320
321 - def write_file(self, records) :
322 """Use this to write an entire file containing the given records. 323 324 records - A list or iterator returning SeqRecord objects. 325 If len(records) is not supported, then it will be 326 converted into a list. 327 328 This method should only be called once for each file.""" 329 try : 330 #This will work for a list, and some of the SeqIO 331 #iterators too, like the StockholmIterator 332 count = len(records) 333 except TypeError : 334 #Probably have an standard iterator, not a list... 335 records = list(records) 336 count = len(records) 337 338 if count == 0 : 339 raise ValueError("Must have at least one sequence") 340 341 self.write_header(count) 342 self.write_records(records) 343 self.write_footer()
344 #Don't automatically close the file. This would prevent 345 #things like writing concatenated alignments as used for 346 #phylogenetic bootstrapping (usually done with phylip). 347 #self.close() 348
349 - def write_record(self, record):
350 """Write a single Stockholm record to the file.""" 351 assert self._header_written 352 assert not self._footer_written 353 self._record_written = True 354 355 if self._length_of_sequences is None : 356 self._length_of_sequences = len(record.seq) 357 elif self._length_of_sequences <> len(record.seq) : 358 raise ValueError("Sequences must all be the same length") 359 360 if self._length_of_sequences == 0 : 361 raise ValueError("Non-empty sequences are required") 362 363 #For the case for stockholm to stockholm, try and use record.name 364 seq_name = record.id 365 if record.name is not None : 366 if "accession" in record.annotations : 367 if record.id == record.annotations["accession"] : 368 seq_name = record.name 369 370 #In the Stockholm file format, spaces are not allowed in the id 371 seq_name = seq_name.replace(" ","_") 372 373 if "start" in record.annotations \ 374 and "end" in record.annotations : 375 suffix = "/%s-%s" % (str(record.annotations["start"]), 376 str(record.annotations["end"])) 377 if seq_name[-len(suffix):] <> suffix : 378 seq_name = "%s/%s-%s" % (seq_name, 379 str(record.annotations["start"]), 380 str(record.annotations["end"])) 381 382 if seq_name in self._ids_written : 383 raise ValueError("Duplicate record identifier: %s" % seq_name) 384 self._ids_written.append(seq_name) 385 self.handle.write("%s %s\n" % (seq_name, record.seq.tostring())) 386 387 #The recommended placement for GS lines (per sequence annotation) 388 #is above the alignment (as a header block) or just below the 389 #corresponding sequence. 390 # 391 #The recommended placement for GR lines (per sequence per column 392 #annotation such as secondary structure) is just below the 393 #corresponding sequence. 394 # 395 #We put both just below the corresponding sequence as this allows 396 #us to write the file using a single pass through the records. 397 398 #AC = Accession 399 if "accession" in record.annotations : 400 self.handle.write("#=GS %s AC %s\n" % (seq_name, self.clean(record.annotations["accession"]))) 401 elif record.id : 402 self.handle.write("#=GS %s AC %s\n" % (seq_name, self.clean(record.id))) 403 404 #DE = description 405 if record.description : 406 self.handle.write("#=GS %s DE %s\n" % (seq_name, self.clean(record.description))) 407 408 #DE = database links 409 for xref in record.dbxrefs : 410 self.handle.write("#=GS %s DR %s\n" % (seq_name, self.clean(xref))) 411 412 #GS/GR = other per sequence annotation 413 for key in record.annotations : 414 if key in self.pfam_gs_mapping : 415 self.handle.write("#=GS %s %s %s\n" \ 416 % (seq_name, 417 self.clean(self.pfam_gs_mapping[key]), 418 self.clean(str(record.annotations[key])))) 419 elif key in self.pfam_gr_mapping : 420 if len(str(record.annotations[key]))==len(record.seq) : 421 self.handle.write("#=GR %s %s %s\n" \ 422 % (seq_name, 423 self.clean(self.pfam_gr_mapping[key]), 424 self.clean((record.annotations[key])))) 425 else : 426 #Should we print a warning? 427 pass 428 else : 429 #It doesn't follow the PFAM standards, but should we record this data anyway? 430 pass
431 432 433 434
446 447 if __name__ == "__main__" : 448 print "Testing..." 449 from cStringIO import StringIO 450 451 452 # This example with its slightly odd (partial) annotation is from here: 453 # http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html 454 # I don't know what the "GR_O31699/88-139_IN ..." line is meant to be. 455 sth_example = \ 456 """# STOCKHOLM 1.0 457 #=GF ID CBS 458 #=GF AC PF00571 459 #=GF DE CBS domain 460 #=GF AU Bateman A 461 #=GF CC CBS domains are small intracellular modules mostly found 462 #=GF CC in 2 or four copies within a protein. 463 #=GF SQ 67 464 #=GS O31698/18-71 AC O31698 465 #=GS O83071/192-246 AC O83071 466 #=GS O83071/259-312 AC O83071 467 #=GS O31698/88-139 AC O31698 468 #=GS O31698/88-139 OS Bacillus subtilis 469 O83071/192-246 MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRVPVYERS 470 #=GR O83071/192-246 SA 999887756453524252..55152525....36463774777 471 O83071/259-312 MQHVSAPVFVFECTRLAY..VQHKLRAH....SRAVAIVLDEY 472 #=GR O83071/259-312 SS CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEE 473 O31698/18-71 MIEADKVAHVQVGNNLEH..ALLVLTKT....GYTAIPVLDPS 474 #=GR O31698/18-71 SS CCCHHHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEHHH 475 O31698/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE 476 #=GR O31698/88-139 SS CCCCCCCHHHHHHHHHHH..HEEEEEEE....EEEEEEEEEEH 477 #=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH 478 O31699/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE 479 #=GR O31699/88-139 AS ________________*__________________________ 480 #=GR_O31699/88-139_IN ____________1______________2__________0____ 481 // 482 """ 483 484 # Interlaced example from BioPerl documentation. Also note the blank line. 485 # http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format 486 sth_example2 = \ 487 """# STOCKHOLM 1.0 488 #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>.. 489 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU 490 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..-- 491 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU 492 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>---- 493 494 #=GC SS_cons ......<<<<<<<.......>>>>>>>..>>>>>>>>............... 495 AP001509.1 CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU 496 #=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>--------------- 497 AE007476.1 UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU 498 #=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>--------------- 499 //""" 500 501 print "--------" 502 print "StockholmIterator(stockholm alignment file)" 503 iterator = StockholmIterator(StringIO(sth_example)) 504 count=0 505 for record in iterator : 506 count=count+1 507 assert count == 5 508 #Check the first record... 509 assert record.id == 'O31699/88-139' 510 assert record.name == 'O31699' 511 assert record.description == 'O31699/88-139' 512 assert len(record.annotations)==4 513 assert record.annotations["accession"]=='O31699' 514 assert record.annotations["start"]==88 515 assert record.annotations["end"]==139 516 assert record.annotations["active_site"]=='________________*__________________________' 517 518 iterator = StockholmIterator(StringIO(sth_example)) 519 count=0 520 for record in iterator : 521 count=count+1 522 break 523 assert count==1 524 #Check the last record... 525 assert record.id == 'O83071/192-246' 526 assert record.name == 'O83071' 527 assert record.description == 'O83071/192-246' 528 assert len(record.annotations)==4 529 assert record.annotations["accession"]=='O83071' 530 assert record.annotations["start"]==192 531 assert record.annotations["end"]==246 532 assert record.annotations["surface_accessibility"]=="999887756453524252..55152525....36463774777" 533 534 assert [r.id for r in StockholmIterator(StringIO(sth_example))] \ 535 == ['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139', 'O31699/88-139'] 536 537 assert [r.name for r in StockholmIterator(StringIO(sth_example))] \ 538 == ['O83071', 'O83071', 'O31698', 'O31698', 'O31699'] 539 540 assert [r.description for r in StockholmIterator(StringIO(sth_example))] \ 541 == ['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139', 'O31699/88-139'] 542 543 544 print "--------" 545 print "StockholmIterator(interlaced stockholm alignment file)" 546 iterator = StockholmIterator(StringIO(sth_example2)) 547 record = iterator.next() 548 assert record.id == "AP001509.1" 549 assert len(record.seq) == 104 550 assert "secondary_structure" in record.annotations 551 assert len(record.annotations["secondary_structure"]) == 104 552 record = iterator.next() 553 assert record.id == "AE007476.1" 554 assert len(record.seq) == 104 555 assert "secondary_structure" in record.annotations 556 assert len(record.annotations["secondary_structure"]) == 104 557 record = iterator.next() 558 assert record is None 559