1
2
3
4
5
6
7 """Bio.SeqIO support for the "stockholm" (aka PFAM) file format.
8
9 You were expected to use this module via the Bio.SeqIO functions.
10 This module has now been replaced by Bio.AlignIO.PhylipIO, and is
11 deprecated."""
12
13 import warnings
14 warnings.warn("Bio.SeqIO.StockholmIO is deprecated. You can continue to read" \
15 + " and write 'clustal' files with Bio.SeqIO, but this is now" \
16 + " handled via Bio.AlignIO internally.",
17 DeprecationWarning)
18
19 from Bio.Alphabet import single_letter_alphabet
20 from Bio.Seq import Seq
21 from Bio.SeqRecord import SeqRecord
22 from Interfaces import InterlacedSequenceIterator, SequentialSequenceWriter
23
25 """Loads a Stockholm file from PFAM into SeqRecord objects.
26
27 The entire file is loaded, and any sequence can be accessed using
28 the [index] notation.
29
30 This parser will detect if the Stockholm file follows the PFAM conventions
31 for sequence specific meta-data (lines starting #=GS and #=GR) and
32 populates the SeqRecord fields accordingly.
33
34 Any annotation which does not follow the PFAM conventions is currently
35 ignored.
36
37 If an accession is provided for an entry in the meta data, IT WILL NOT
38 be used as the record.id (it will be recorded in the record's annotations).
39 This is because some files have (sub) sequences from different parts of
40 the same accession (differentiated by different start-end positions).
41
42 Wrap-around alignments are not supported - each sequences must be on
43 a single line. However, interlaced sequences should work.
44
45 For more information on the file format, please see:
46 http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format
47 http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html
48
49 For consistency with BioPerl and EMBOSS we call this the "stockholm"
50 format.
51 """
52
53
54
55 pfam_gr_mapping = {"SS" : "secondary_structure",
56 "SA" : "surface_accessibility",
57 "TM" : "transmembrane",
58 "PP" : "posterior_probability",
59 "LI" : "ligand_binding",
60 "AS" : "active_site",
61 "IN" : "intron"}
62
63 pfam_gs_mapping = {"OS" : "organism",
64 "OC" : "organism_classification",
65 "LO" : "look"}
66
67
81
83 line = self.handle.readline()
84 if not line:
85
86 return
87 if not line.strip() == '# STOCKHOLM 1.0':
88 raise ValueError("Did not find STOCKHOLM header")
89
90
91
92
93
94
95
96
97 seqs = {}
98 ids = []
99 gs = {}
100 gr = {}
101 passed_end_alignment = False
102 while 1:
103 line = self.handle.readline()
104 if not line: break
105 line = line.strip()
106 if line == "//" :
107
108
109 passed_end_alignment = True
110 elif line == "" :
111
112 pass
113 elif line[0] <> "#" :
114
115
116 assert not passed_end_alignment
117 parts = [x.strip() for x in line.split(" ",1)]
118 if len(parts) <> 2 :
119
120 raise ValueError("Could not split line into identifier " \
121 + "and sequence:\n" + line)
122 id, seq = parts
123 if id not in ids :
124 ids.append(id)
125 seqs.setdefault(id, '')
126 seqs[id] += seq.replace(".","-")
127 elif len(line) >= 5 :
128
129 if line[:5] == "#=GF " :
130
131
132 pass
133 elif line[:5] == '#=GC ':
134
135
136 pass
137 elif line[:5] == '#=GS ':
138
139
140 id, feature, text = line[5:].strip().split(None,2)
141
142
143 if id not in gs :
144 gs[id] = {}
145 if feature not in gs[id] :
146 gs[id][feature] = [text]
147 else :
148 gs[id][feature].append(text)
149 elif line[:5] == "#=GR " :
150
151
152 id, feature, text = line[5:].strip().split(None,2)
153
154
155 if id not in gr :
156 gr[id] = {}
157 if feature not in gr[id]:
158 gr[id][feature] = ""
159 gr[id][feature] += text.strip()
160
161
162
163
164
165
166 assert len(seqs) <= len(ids)
167
168
169
170
171 if ids and seqs :
172 align_len = None
173 for id, seq in seqs.iteritems() :
174 assert id in ids
175 if align_len is None :
176 align_len = len(seq)
177 elif align_len <> len(seq) :
178 raise ValueError("Sequences have different lengths, or repeated identifier")
179
180
181 self.ids = ids
182 self.sequences = seqs
183 self.seq_annotation = gs
184 self.seq_col_annotation = gr
185 self.move_start()
186
189
199
232
263
284
286 """Class to write PFAM style Stockholm format files.
287
288 Note that sequences and their annotation are recorded
289 together (rather than having a block of annotation followed
290 by a block of aligned sequences).
291 """
292
293
294
295 pfam_gr_mapping = { "secondary_structure" : "SS",
296 "surface_accessibility" : "SA",
297 "transmembrane" : "TM",
298 "posterior_probability" : "PP",
299 "ligand_binding" : "LI",
300 "active_site" : "AS",
301 "intron" : "IN"}
302
303 pfam_gs_mapping = {"organism" : "OS",
304 "organism_classification" : "OC",
305 "look" : "LO"}
306
308 """Creates the writer object.
309
310 Use the method write_file() to actually record your sequence records."""
311 SequentialSequenceWriter.__init__(self, handle)
312 self._ids_written = []
313 self._length_of_sequences = None
314
320
322 """Use this to write an entire file containing the given records.
323
324 records - A list or iterator returning SeqRecord objects.
325 If len(records) is not supported, then it will be
326 converted into a list.
327
328 This method should only be called once for each file."""
329 try :
330
331
332 count = len(records)
333 except TypeError :
334
335 records = list(records)
336 count = len(records)
337
338 if count == 0 :
339 raise ValueError("Must have at least one sequence")
340
341 self.write_header(count)
342 self.write_records(records)
343 self.write_footer()
344
345
346
347
348
350 """Write a single Stockholm record to the file."""
351 assert self._header_written
352 assert not self._footer_written
353 self._record_written = True
354
355 if self._length_of_sequences is None :
356 self._length_of_sequences = len(record.seq)
357 elif self._length_of_sequences <> len(record.seq) :
358 raise ValueError("Sequences must all be the same length")
359
360 if self._length_of_sequences == 0 :
361 raise ValueError("Non-empty sequences are required")
362
363
364 seq_name = record.id
365 if record.name is not None :
366 if "accession" in record.annotations :
367 if record.id == record.annotations["accession"] :
368 seq_name = record.name
369
370
371 seq_name = seq_name.replace(" ","_")
372
373 if "start" in record.annotations \
374 and "end" in record.annotations :
375 suffix = "/%s-%s" % (str(record.annotations["start"]),
376 str(record.annotations["end"]))
377 if seq_name[-len(suffix):] <> suffix :
378 seq_name = "%s/%s-%s" % (seq_name,
379 str(record.annotations["start"]),
380 str(record.annotations["end"]))
381
382 if seq_name in self._ids_written :
383 raise ValueError("Duplicate record identifier: %s" % seq_name)
384 self._ids_written.append(seq_name)
385 self.handle.write("%s %s\n" % (seq_name, record.seq.tostring()))
386
387
388
389
390
391
392
393
394
395
396
397
398
399 if "accession" in record.annotations :
400 self.handle.write("#=GS %s AC %s\n" % (seq_name, self.clean(record.annotations["accession"])))
401 elif record.id :
402 self.handle.write("#=GS %s AC %s\n" % (seq_name, self.clean(record.id)))
403
404
405 if record.description :
406 self.handle.write("#=GS %s DE %s\n" % (seq_name, self.clean(record.description)))
407
408
409 for xref in record.dbxrefs :
410 self.handle.write("#=GS %s DR %s\n" % (seq_name, self.clean(xref)))
411
412
413 for key in record.annotations :
414 if key in self.pfam_gs_mapping :
415 self.handle.write("#=GS %s %s %s\n" \
416 % (seq_name,
417 self.clean(self.pfam_gs_mapping[key]),
418 self.clean(str(record.annotations[key]))))
419 elif key in self.pfam_gr_mapping :
420 if len(str(record.annotations[key]))==len(record.seq) :
421 self.handle.write("#=GR %s %s %s\n" \
422 % (seq_name,
423 self.clean(self.pfam_gr_mapping[key]),
424 self.clean((record.annotations[key]))))
425 else :
426
427 pass
428 else :
429
430 pass
431
432
433
434
446
447 if __name__ == "__main__" :
448 print "Testing..."
449 from cStringIO import StringIO
450
451
452
453
454
455 sth_example = \
456 """# STOCKHOLM 1.0
457 #=GF ID CBS
458 #=GF AC PF00571
459 #=GF DE CBS domain
460 #=GF AU Bateman A
461 #=GF CC CBS domains are small intracellular modules mostly found
462 #=GF CC in 2 or four copies within a protein.
463 #=GF SQ 67
464 #=GS O31698/18-71 AC O31698
465 #=GS O83071/192-246 AC O83071
466 #=GS O83071/259-312 AC O83071
467 #=GS O31698/88-139 AC O31698
468 #=GS O31698/88-139 OS Bacillus subtilis
469 O83071/192-246 MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRVPVYERS
470 #=GR O83071/192-246 SA 999887756453524252..55152525....36463774777
471 O83071/259-312 MQHVSAPVFVFECTRLAY..VQHKLRAH....SRAVAIVLDEY
472 #=GR O83071/259-312 SS CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEE
473 O31698/18-71 MIEADKVAHVQVGNNLEH..ALLVLTKT....GYTAIPVLDPS
474 #=GR O31698/18-71 SS CCCHHHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEHHH
475 O31698/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
476 #=GR O31698/88-139 SS CCCCCCCHHHHHHHHHHH..HEEEEEEE....EEEEEEEEEEH
477 #=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH
478 O31699/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
479 #=GR O31699/88-139 AS ________________*__________________________
480 #=GR_O31699/88-139_IN ____________1______________2__________0____
481 //
482 """
483
484
485
486 sth_example2 = \
487 """# STOCKHOLM 1.0
488 #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>..
489 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU
490 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..--
491 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU
492 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----
493
494 #=GC SS_cons ......<<<<<<<.......>>>>>>>..>>>>>>>>...............
495 AP001509.1 CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
496 #=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>---------------
497 AE007476.1 UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
498 #=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>---------------
499 //"""
500
501 print "--------"
502 print "StockholmIterator(stockholm alignment file)"
503 iterator = StockholmIterator(StringIO(sth_example))
504 count=0
505 for record in iterator :
506 count=count+1
507 assert count == 5
508
509 assert record.id == 'O31699/88-139'
510 assert record.name == 'O31699'
511 assert record.description == 'O31699/88-139'
512 assert len(record.annotations)==4
513 assert record.annotations["accession"]=='O31699'
514 assert record.annotations["start"]==88
515 assert record.annotations["end"]==139
516 assert record.annotations["active_site"]=='________________*__________________________'
517
518 iterator = StockholmIterator(StringIO(sth_example))
519 count=0
520 for record in iterator :
521 count=count+1
522 break
523 assert count==1
524
525 assert record.id == 'O83071/192-246'
526 assert record.name == 'O83071'
527 assert record.description == 'O83071/192-246'
528 assert len(record.annotations)==4
529 assert record.annotations["accession"]=='O83071'
530 assert record.annotations["start"]==192
531 assert record.annotations["end"]==246
532 assert record.annotations["surface_accessibility"]=="999887756453524252..55152525....36463774777"
533
534 assert [r.id for r in StockholmIterator(StringIO(sth_example))] \
535 == ['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139', 'O31699/88-139']
536
537 assert [r.name for r in StockholmIterator(StringIO(sth_example))] \
538 == ['O83071', 'O83071', 'O31698', 'O31698', 'O31699']
539
540 assert [r.description for r in StockholmIterator(StringIO(sth_example))] \
541 == ['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139', 'O31699/88-139']
542
543
544 print "--------"
545 print "StockholmIterator(interlaced stockholm alignment file)"
546 iterator = StockholmIterator(StringIO(sth_example2))
547 record = iterator.next()
548 assert record.id == "AP001509.1"
549 assert len(record.seq) == 104
550 assert "secondary_structure" in record.annotations
551 assert len(record.annotations["secondary_structure"]) == 104
552 record = iterator.next()
553 assert record.id == "AE007476.1"
554 assert len(record.seq) == 104
555 assert "secondary_structure" in record.annotations
556 assert len(record.annotations["secondary_structure"]) == 104
557 record = iterator.next()
558 assert record is None
559