Package Bio :: Package AlignIO :: Module StockholmIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.StockholmIO

  1  # Copyright 2006-2008 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  from Bio.Align.Generic import Alignment 
  7  from Interfaces import AlignmentIterator, SequentialAlignmentWriter 
  8   
9 -class StockholmWriter(SequentialAlignmentWriter) :
10 """Stockholm/PFAM alignment writer.""" 11 12 #These dictionaries should be kept in sync with those 13 #defined in the StockholmIterator class. 14 pfam_gr_mapping = {"secondary_structure" : "SS", 15 "surface_accessibility" : "SA", 16 "transmembrane" : "TM", 17 "posterior_probability" : "PP", 18 "ligand_binding" : "LI", 19 "active_site" : "AS", 20 "intron" : "IN"} 21 #Following dictionary deliberately does not cover AC, DE or DR 22 pfam_gs_mapping = {"organism" : "OS", 23 "organism_classification" : "OC", 24 "look" : "LO"} 25
26 - def write_alignment(self, alignment) :
27 """Use this to write (another) single alignment to an open file. 28 29 Note that sequences and their annotation are recorded 30 together (rather than having a block of annotation followed 31 by a block of aligned sequences). 32 """ 33 records = alignment.get_all_seqs() 34 count = len(records) 35 36 self._length_of_sequences = alignment.get_alignment_length() 37 self._ids_written = [] 38 39 #NOTE - For now, the alignment object does not hold any per column 40 #or per alignment annotation - only per sequence. 41 42 if count == 0 : 43 raise ValueError("Must have at least one sequence") 44 if self._length_of_sequences == 0 : 45 raise ValueError("Non-empty sequences are required") 46 47 self.handle.write("# STOCKHOLM 1.0\n") 48 self.handle.write("#=GF SQ %i\n" % count) 49 for record in records : 50 self._write_record(record) 51 self.handle.write("//\n")
52
53 - def _write_record(self, record):
54 """Write a single SeqRecord to the file""" 55 if self._length_of_sequences <> len(record.seq) : 56 raise ValueError("Sequences must all be the same length") 57 58 #For the case for stockholm to stockholm, try and use record.name 59 seq_name = record.id 60 if record.name is not None : 61 if "accession" in record.annotations : 62 if record.id == record.annotations["accession"] : 63 seq_name = record.name 64 65 #In the Stockholm file format, spaces are not allowed in the id 66 seq_name = seq_name.replace(" ","_") 67 68 if "start" in record.annotations \ 69 and "end" in record.annotations : 70 suffix = "/%s-%s" % (str(record.annotations["start"]), 71 str(record.annotations["end"])) 72 if seq_name[-len(suffix):] <> suffix : 73 seq_name = "%s/%s-%s" % (seq_name, 74 str(record.annotations["start"]), 75 str(record.annotations["end"])) 76 77 if seq_name in self._ids_written : 78 raise ValueError("Duplicate record identifier: %s" % seq_name) 79 self._ids_written.append(seq_name) 80 self.handle.write("%s %s\n" % (seq_name, record.seq.tostring())) 81 82 #The recommended placement for GS lines (per sequence annotation) 83 #is above the alignment (as a header block) or just below the 84 #corresponding sequence. 85 # 86 #The recommended placement for GR lines (per sequence per column 87 #annotation such as secondary structure) is just below the 88 #corresponding sequence. 89 # 90 #We put both just below the corresponding sequence as this allows 91 #us to write the file using a single pass through the records. 92 93 #AC = Accession 94 if "accession" in record.annotations : 95 self.handle.write("#=GS %s AC %s\n" \ 96 % (seq_name, self.clean(record.annotations["accession"]))) 97 elif record.id : 98 self.handle.write("#=GS %s AC %s\n" \ 99 % (seq_name, self.clean(record.id))) 100 101 #DE = description 102 if record.description : 103 self.handle.write("#=GS %s DE %s\n" \ 104 % (seq_name, self.clean(record.description))) 105 106 #DE = database links 107 for xref in record.dbxrefs : 108 self.handle.write("#=GS %s DR %s\n" \ 109 % (seq_name, self.clean(xref))) 110 111 #GS/GR = other per sequence annotation 112 for key in record.annotations : 113 if key in self.pfam_gs_mapping : 114 self.handle.write("#=GS %s %s %s\n" \ 115 % (seq_name, 116 self.clean(self.pfam_gs_mapping[key]), 117 self.clean(str(record.annotations[key])))) 118 elif key in self.pfam_gr_mapping : 119 if len(str(record.annotations[key]))==len(record.seq) : 120 self.handle.write("#=GR %s %s %s\n" \ 121 % (seq_name, 122 self.clean(self.pfam_gr_mapping[key]), 123 self.clean((record.annotations[key])))) 124 else : 125 #Should we print a warning? 126 pass 127 else : 128 #It doesn't follow the PFAM standards, but should we record 129 #this data anyway? 130 pass
131
132 -class StockholmIterator(AlignmentIterator) :
133 """Loads a Stockholm file from PFAM into Alignment objects. 134 135 The file may contain multiple concatenated alignments, which are loaded 136 and returned incrementally. 137 138 This parser will detect if the Stockholm file follows the PFAM 139 conventions for sequence specific meta-data (lines starting #=GS 140 and #=GR) and populates the SeqRecord fields accordingly. 141 142 Any annotation which does not follow the PFAM conventions is currently 143 ignored. 144 145 If an accession is provided for an entry in the meta data, IT WILL NOT 146 be used as the record.id (it will be recorded in the record's 147 annotations). This is because some files have (sub) sequences from 148 different parts of the same accession (differentiated by different 149 start-end positions). 150 151 Wrap-around alignments are not supported - each sequences must be on 152 a single line. However, interlaced sequences should work. 153 154 For more information on the file format, please see: 155 http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format 156 http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html 157 158 For consistency with BioPerl and EMBOSS we call this the "stockholm" 159 format. 160 """ 161 162 #These dictionaries should be kept in sync with those 163 #defined in the PfamStockholmWriter class. 164 pfam_gr_mapping = {"SS" : "secondary_structure", 165 "SA" : "surface_accessibility", 166 "TM" : "transmembrane", 167 "PP" : "posterior_probability", 168 "LI" : "ligand_binding", 169 "AS" : "active_site", 170 "IN" : "intron"} 171 #Following dictionary deliberately does not cover AC, DE or DR 172 pfam_gs_mapping = {"OS" : "organism", 173 "OC" : "organism_classification", 174 "LO" : "look"} 175
176 - def next(self) :
177 try : 178 line = self._header 179 del self._header 180 except AttributeError : 181 line = self.handle.readline() 182 if not line: 183 #Empty file - just give up. 184 return 185 if not line.strip() == '# STOCKHOLM 1.0': 186 raise ValueError("Did not find STOCKHOLM header") 187 #import sys 188 #print >> sys.stderr, 'Warning file does not start with STOCKHOLM 1.0' 189 190 # Note: If this file follows the PFAM conventions, there should be 191 # a line containing the number of sequences, e.g. "#=GF SQ 67" 192 # We do not check for this - perhaps we should, and verify that 193 # if present it agrees with our parsing. 194 195 seqs = {} 196 ids = [] 197 gs = {} 198 gr = {} 199 gf = {} 200 passed_end_alignment = False 201 while 1: 202 line = self.handle.readline() 203 if not line: break #end of file 204 line = line.strip() #remove trailing \n 205 if line == '# STOCKHOLM 1.0': 206 self._header = line 207 break 208 elif line == "//" : 209 #The "//" line indicates the end of the alignment. 210 #There may still be more meta-data 211 passed_end_alignment = True 212 elif line == "" : 213 #blank line, ignore 214 pass 215 elif line[0] <> "#" : 216 #Sequence 217 #Format: "<seqname> <sequence>" 218 assert not passed_end_alignment 219 parts = [x.strip() for x in line.split(" ",1)] 220 if len(parts) <> 2 : 221 #This might be someone attempting to store a zero length sequence? 222 raise ValueError("Could not split line into identifier " \ 223 + "and sequence:\n" + line) 224 id, seq = parts 225 if id not in ids : 226 ids.append(id) 227 seqs.setdefault(id, '') 228 seqs[id] += seq.replace(".","-") 229 elif len(line) >= 5 : 230 #Comment line or meta-data 231 if line[:5] == "#=GF " : 232 #Generic per-File annotation, free text 233 #Format: #=GF <feature> <free text> 234 feature, text = line[5:].strip().split(None,1) 235 #Each feature key could be used more than once, 236 #so store the entries as a list of strings. 237 if feature not in gf : 238 gf[feature] = [text] 239 else : 240 gf[feature].append(text) 241 elif line[:5] == '#=GC ': 242 #Generic per-Column annotation, exactly 1 char per column 243 #Format: "#=GC <feature> <exactly 1 char per column>" 244 pass 245 elif line[:5] == '#=GS ': 246 #Generic per-Sequence annotation, free text 247 #Format: "#=GS <seqname> <feature> <free text>" 248 id, feature, text = line[5:].strip().split(None,2) 249 #if id not in ids : 250 # ids.append(id) 251 if id not in gs : 252 gs[id] = {} 253 if feature not in gs[id] : 254 gs[id][feature] = [text] 255 else : 256 gs[id][feature].append(text) 257 elif line[:5] == "#=GR " : 258 #Generic per-Sequence AND per-Column markup 259 #Format: "#=GR <seqname> <feature> <exactly 1 char per column>" 260 id, feature, text = line[5:].strip().split(None,2) 261 #if id not in ids : 262 # ids.append(id) 263 if id not in gr : 264 gr[id] = {} 265 if feature not in gr[id]: 266 gr[id][feature] = "" 267 gr[id][feature] += text.strip() # append to any previous entry 268 #TODO - Should we check the length matches the alignment length? 269 # For iterlaced sequences the GR data can be split over 270 # multiple lines 271 #Next line... 272 273 274 assert len(seqs) <= len(ids) 275 #assert len(gs) <= len(ids) 276 #assert len(gr) <= len(ids) 277 278 self.ids = ids 279 self.sequences = seqs 280 self.seq_annotation = gs 281 self.seq_col_annotation = gr 282 283 if ids and seqs : 284 285 if self.records_per_alignment is not None \ 286 and self.records_per_alignment <> len(ids) : 287 raise ValueError("Found %i records in this alignment, told to expect %i" \ 288 % (len(ids), self.records_per_alignment)) 289 290 alignment = Alignment(self.alphabet) 291 292 #TODO - Introduce an annotated alignment class? 293 #For now, store the annotation a new private property: 294 alignment._annotations = gr 295 296 alignment_length = len(seqs.values()[0]) 297 for id in ids : 298 seq = seqs[id] 299 if alignment_length <> len(seq) : 300 raise ValueError("Sequences have different lengths, or repeated identifier") 301 name, start, end = self._identifier_split(id) 302 alignment.add_sequence(id, seq, start=start, end=end) 303 304 record = alignment.get_all_seqs()[-1] 305 306 assert record.id == id or record.description == id 307 308 record.id = id 309 record.name = name 310 record.description = id 311 312 #will be overridden by _populate_meta_data if an explicit 313 #accession is provided: 314 record.annotations["accession"]=name 315 316 self._populate_meta_data(id, record) 317 return alignment 318 else : 319 return None
320 321
322 - def _identifier_split(self, identifier) :
323 """Returns (name,start,end) string tuple from an identier.""" 324 if identifier.find("/")<>-1 : 325 start_end = identifier.split("/",1)[1] 326 if start_end.count("-")==1 : 327 start, end = map(int, start_end.split("-")) 328 name = identifier.split("/",1)[0] 329 return (name, start, end) 330 return (identifier, None, None)
331
332 - def _get_meta_data(self, identifier, meta_dict) :
333 """Takes an itentifier and returns dict of all meta-data matching it. 334 335 For example, given "Q9PN73_CAMJE/149-220" will return all matches to 336 this or "Q9PN73_CAMJE" which the identifier without its /start-end 337 suffix. 338 339 In the example below, the suffix is required to match the AC, but must 340 be removed to match the OS and OC meta-data. 341 342 # STOCKHOLM 1.0 343 #=GS Q9PN73_CAMJE/149-220 AC Q9PN73 344 ... 345 Q9PN73_CAMJE/149-220 NKA... 346 ... 347 #=GS Q9PN73_CAMJE OS Campylobacter jejuni 348 #=GS Q9PN73_CAMJE OC Bacteria 349 350 This function will return an empty dictionary if no data is found.""" 351 name, start, end = self._identifier_split(identifier) 352 if name==identifier : 353 identifier_keys = [identifier] 354 else : 355 identifier_keys = [identifier, name] 356 answer = {} 357 for identifier_key in identifier_keys : 358 try : 359 for feature_key in meta_dict[identifier_key] : 360 answer[feature_key] = meta_dict[identifier_key][feature_key] 361 except KeyError : 362 pass 363 return answer
364
365 - def _populate_meta_data(self, identifier, record) :
366 """Adds meta-date to a SecRecord's annotations dictionary. 367 368 This function applies the PFAM conventions.""" 369 370 seq_data = self._get_meta_data(identifier, self.seq_annotation) 371 for feature in seq_data : 372 #Note this dictionary contains lists! 373 if feature=="AC" : #ACcession number 374 assert len(seq_data[feature])==1 375 record.annotations["accession"]=seq_data[feature][0] 376 elif feature=="DE" : #DEscription 377 record.description = "\n".join(seq_data[feature]) 378 elif feature=="DR" : #Database Reference 379 #Should we try and parse the strings? 380 record.dbxrefs = seq_data[feature] 381 elif feature in self.pfam_gs_mapping : 382 record.annotations[self.pfam_gs_mapping[feature]] = ", ".join(seq_data[feature]) 383 else : 384 #Ignore it? 385 record.annotations["GS:" + feature] = ", ".join(seq_data[feature]) 386 387 seq_col_data = self._get_meta_data(identifier, self.seq_col_annotation) 388 for feature in seq_col_data : 389 #Note this dictionary contains strings! 390 if feature in self.pfam_gr_mapping : 391 record.annotations[self.pfam_gr_mapping[feature]] = seq_col_data[feature] 392 else : 393 #Ignore it? 394 record.annotations["GR:" + feature] = seq_col_data[feature]
395 396 if __name__ == "__main__" : 397 print "Testing..." 398 from cStringIO import StringIO 399 400 # This example with its slightly odd (partial) annotation is from here: 401 # http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html 402 # I don't know what the "GR_O31699/88-139_IN ..." line is meant to be. 403 sth_example = \ 404 """# STOCKHOLM 1.0 405 #=GF ID CBS 406 #=GF AC PF00571 407 #=GF DE CBS domain 408 #=GF AU Bateman A 409 #=GF CC CBS domains are small intracellular modules mostly found 410 #=GF CC in 2 or four copies within a protein. 411 #=GF SQ 67 412 #=GS O31698/18-71 AC O31698 413 #=GS O83071/192-246 AC O83071 414 #=GS O83071/259-312 AC O83071 415 #=GS O31698/88-139 AC O31698 416 #=GS O31698/88-139 OS Bacillus subtilis 417 O83071/192-246 MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRVPVYERS 418 #=GR O83071/192-246 SA 999887756453524252..55152525....36463774777 419 O83071/259-312 MQHVSAPVFVFECTRLAY..VQHKLRAH....SRAVAIVLDEY 420 #=GR O83071/259-312 SS CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEE 421 O31698/18-71 MIEADKVAHVQVGNNLEH..ALLVLTKT....GYTAIPVLDPS 422 #=GR O31698/18-71 SS CCCHHHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEHHH 423 O31698/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE 424 #=GR O31698/88-139 SS CCCCCCCHHHHHHHHHHH..HEEEEEEE....EEEEEEEEEEH 425 #=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH 426 O31699/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE 427 #=GR O31699/88-139 AS ________________*__________________________ 428 #=GR_O31699/88-139_IN ____________1______________2__________0____ 429 // 430 """ 431 432 # Interlaced example from BioPerl documentation. Also note the blank line. 433 # http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format 434 sth_example2 = \ 435 """# STOCKHOLM 1.0 436 #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>.. 437 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU 438 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..-- 439 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU 440 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>---- 441 442 #=GC SS_cons ......<<<<<<<.......>>>>>>>..>>>>>>>>............... 443 AP001509.1 CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU 444 #=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>--------------- 445 AE007476.1 UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU 446 #=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>--------------- 447 //""" 448 449 print "--------" 450 print "StockholmIterator(stockholm alignment file)" 451 iterator = StockholmIterator(StringIO(sth_example)) 452 count=0 453 for alignment in iterator : 454 for record in alignment.get_all_seqs() : 455 count=count+1 456 assert count == 5 457 458 #Check the last record... 459 assert record.id == 'O31699/88-139' 460 assert record.name == 'O31699' 461 assert record.description == 'O31699/88-139' 462 assert len(record.annotations)==4+1 #weight 463 assert record.annotations["accession"]=='O31699' 464 assert record.annotations["start"]==88 465 assert record.annotations["end"]==139 466 assert record.annotations["active_site"]=='________________*__________________________' 467 468 iterator = StockholmIterator(StringIO(sth_example)) 469 count=0 470 for alignment in iterator : 471 for record in alignment.get_all_seqs() : 472 count=count+1 473 break 474 break 475 assert count==1 476 #Check the last record... 477 assert record.id == 'O83071/192-246' 478 assert record.name == 'O83071' 479 assert record.description == 'O83071/192-246' 480 assert len(record.annotations)==4 + 1#weight 481 assert record.annotations["accession"]=='O83071' 482 assert record.annotations["start"]==192 483 assert record.annotations["end"]==246 484 assert record.annotations["surface_accessibility"]=="999887756453524252..55152525....36463774777" 485 486 assert [[r.id for r in a.get_all_seqs()] \ 487 for a in StockholmIterator(StringIO(sth_example))] \ 488 == [['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139', 'O31699/88-139']] 489 490 assert [[r.name for r in a.get_all_seqs()] \ 491 for a in StockholmIterator(StringIO(sth_example))] \ 492 == [['O83071', 'O83071', 'O31698', 'O31698', 'O31699']] 493 494 assert [[r.description for r in a.get_all_seqs()] \ 495 for a in StockholmIterator(StringIO(sth_example))] \ 496 == [['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139', 'O31699/88-139']] 497 498 499 print "--------" 500 print "StockholmIterator(interlaced stockholm alignment file)" 501 iterator = iter(StockholmIterator(StringIO(sth_example2)).next().get_all_seqs()) 502 record = iterator.next() 503 assert record.id == "AP001509.1" 504 assert len(record.seq) == 104 505 assert "secondary_structure" in record.annotations 506 assert len(record.annotations["secondary_structure"]) == 104 507 record = iterator.next() 508 assert record.id == "AE007476.1" 509 assert len(record.seq) == 104 510 assert "secondary_structure" in record.annotations 511 assert len(record.annotations["secondary_structure"]) == 104 512 try : 513 record = iterator.next() 514 except StopIteration : 515 record = None 516 assert record is None 517 518 print "--------" 519 print "StockholmIterator(concatenated stockholm alignment files)" 520 assert 2 == len(list(StockholmIterator(StringIO(sth_example + sth_example2)))) 521 assert 2 == len(list(StockholmIterator(StringIO(sth_example + "\n" + sth_example2)))) 522 assert 2 == len(list(StockholmIterator(StringIO(sth_example + "\n\n" + sth_example2)))) 523 assert 2 == len(list(StockholmIterator(StringIO(sth_example2 + "\n\n" + sth_example)))) 524 assert 2 == len(list(StockholmIterator(StringIO(sth_example2 + "\n" + sth_example)))) 525 526 print "--------" 527 print "Checking write/read" 528 handle = StringIO() 529 list1 = list(StockholmIterator(StringIO(sth_example2 + "\n" + sth_example))) 530 StockholmWriter(handle).write_file(list1) 531 handle.seek(0) 532 list2 = list(StockholmIterator(handle)) 533 assert len(list1) == len(list2) 534 for a1,a2 in zip(list1, list2) : 535 assert len(a1.get_all_seqs()) == len(a2.get_all_seqs()) 536 for r1, r2 in zip(a1.get_all_seqs(), a2.get_all_seqs()) : 537 assert r1.id == r2.id 538 assert r1.seq.tostring() == r2.seq.tostring() 539 print "Done" 540