1
2
3
4
5 """
6 Bio.AlignIO support for the "stockholm" format (used in the PFAM database).
7
8 You are expected to use this module via the Bio.AlignIO functions (or the
9 Bio.SeqIO functions if you want to work directly with the gapped sequences).
10
11 For example, consider a Stockholm alignment file containing the following::
12
13 # STOCKHOLM 1.0
14 #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>..
15 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU
16 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..--
17 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU
18 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----
19
20 #=GC SS_cons ......<<<<<<<.......>>>>>>>..>>>>>>>>...............
21 AP001509.1 CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
22 #=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>---------------
23 AE007476.1 UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
24 #=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>---------------
25 //
26
27 This is a single multiple sequence alignment, so you would probably load this
28 using the Bio.AlignIO.read() function:
29
30 >>> from Bio import AlignIO
31 >>> handle = open("Stockholm/simple.sth", "rU")
32 >>> align = AlignIO.read(handle, "stockholm")
33 >>> handle.close()
34 >>> print align
35 SingleLetterAlphabet() alignment with 2 rows and 104 columns
36 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-G...UGU AP001509.1
37 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-C...GAU AE007476.1
38 >>> for record in align :
39 ... print record.id, len(record)
40 AP001509.1 104
41 AE007476.1 104
42
43 This example file is clearly using RNA, so you might want the Alignment object
44 (and the SeqRecord objects it holds) to reflect this, rather than simple using
45 the default single letter alphabet as shown above. You can do this with an
46 optional argument to the Bio.AlignIO.read() function:
47
48 >>> from Bio import AlignIO
49 >>> from Bio.Alphabet import generic_rna
50 >>> handle = open("Stockholm/simple.sth", "rU")
51 >>> align = AlignIO.read(handle, "stockholm", alphabet=generic_rna)
52 >>> handle.close()
53 >>> print align
54 RNAAlphabet() alignment with 2 rows and 104 columns
55 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-G...UGU AP001509.1
56 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-C...GAU AE007476.1
57
58 In addition to the sequences themselves, this example alignment also includes
59 some GR lines for the secondary structure of the sequences. These are strings,
60 with one character for each letter in the associated sequence:
61
62 >>> for record in align :
63 ... print record.id
64 ... print record.seq
65 ... print record.letter_annotations['secondary_structure']
66 AP001509.1
67 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
68 -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
69 AE007476.1
70 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
71 -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
72
73 Any general annotation for each row is recorded in the SeqRecord's annotations
74 dictionary. You can output this alignment in many different file formats using
75 Bio.AlignIO.write(), or the Alignment object's format method:
76
77 >>> print align.format("fasta")
78 >AP001509.1
79 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-A
80 GGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
81 >AE007476.1
82 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAA
83 GGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
84 <BLANKLINE>
85
86 Most output formats won't be able to hold the annotation possible in a Stockholm file:
87
88 >>> print align.format("stockholm")
89 # STOCKHOLM 1.0
90 #=GF SQ 2
91 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
92 #=GS AP001509.1 AC AP001509.1
93 #=GS AP001509.1 DE AP001509.1
94 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
95 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
96 #=GS AE007476.1 AC AE007476.1
97 #=GS AE007476.1 DE AE007476.1
98 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
99 //
100 <BLANKLINE>
101
102 Note that when writing Stockholm files, Biopython does not break long sequences up and
103 interleave them (as in the input file shown above). The standard allows this simpler
104 layout, and it is more likely to be understood by other tools.
105
106 Finally, as an aside, it can sometimes be useful to use Bio.SeqIO.parse() to iterate over
107 the two rows as SeqRecord objects - rather than working with Alignnment objects.
108 Again, if you want to you can specify this is RNA:
109
110 >>> from Bio import SeqIO
111 >>> from Bio.Alphabet import generic_rna
112 >>> handle = open("Stockholm/simple.sth", "rU")
113 >>> for record in SeqIO.parse(handle, "stockholm", alphabet=generic_rna) :
114 ... print record.id
115 ... print record.seq
116 ... print record.letter_annotations['secondary_structure']
117 AP001509.1
118 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
119 -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
120 AE007476.1
121 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
122 -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
123 >>> handle.close()
124
125 Remember that if you slice a SeqRecord, the per-letter-annotions like the
126 secondary structure string here, are also sliced:
127
128 >>> sub_record = record[10:20]
129 >>> print sub_record.seq
130 AUCGUUUUAC
131 >>> print sub_record.letter_annotations['secondary_structure']
132 -------<<<
133 """
134 __docformat__ = "epytext en"
135 from Bio.Align.Generic import Alignment
136 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
137
139 """Stockholm/PFAM alignment writer."""
140
141
142
143 pfam_gr_mapping = {"secondary_structure" : "SS",
144 "surface_accessibility" : "SA",
145 "transmembrane" : "TM",
146 "posterior_probability" : "PP",
147 "ligand_binding" : "LI",
148 "active_site" : "AS",
149 "intron" : "IN"}
150
151 pfam_gs_mapping = {"organism" : "OS",
152 "organism_classification" : "OC",
153 "look" : "LO"}
154
156 """Use this to write (another) single alignment to an open file.
157
158 Note that sequences and their annotation are recorded
159 together (rather than having a block of annotation followed
160 by a block of aligned sequences).
161 """
162 records = alignment.get_all_seqs()
163 count = len(records)
164
165 self._length_of_sequences = alignment.get_alignment_length()
166 self._ids_written = []
167
168
169
170
171 if count == 0 :
172 raise ValueError("Must have at least one sequence")
173 if self._length_of_sequences == 0 :
174 raise ValueError("Non-empty sequences are required")
175
176 self.handle.write("# STOCKHOLM 1.0\n")
177 self.handle.write("#=GF SQ %i\n" % count)
178 for record in records :
179 self._write_record(record)
180 self.handle.write("//\n")
181
183 """Write a single SeqRecord to the file"""
184 if self._length_of_sequences != len(record.seq) :
185 raise ValueError("Sequences must all be the same length")
186
187
188 seq_name = record.id
189 if record.name is not None :
190 if "accession" in record.annotations :
191 if record.id == record.annotations["accession"] :
192 seq_name = record.name
193
194
195 seq_name = seq_name.replace(" ","_")
196
197 if "start" in record.annotations \
198 and "end" in record.annotations :
199 suffix = "/%s-%s" % (str(record.annotations["start"]),
200 str(record.annotations["end"]))
201 if seq_name[-len(suffix):] != suffix :
202 seq_name = "%s/%s-%s" % (seq_name,
203 str(record.annotations["start"]),
204 str(record.annotations["end"]))
205
206 if seq_name in self._ids_written :
207 raise ValueError("Duplicate record identifier: %s" % seq_name)
208 self._ids_written.append(seq_name)
209 self.handle.write("%s %s\n" % (seq_name, record.seq.tostring()))
210
211
212
213
214
215
216
217
218
219
220
221
222
223 if "accession" in record.annotations :
224 self.handle.write("#=GS %s AC %s\n" \
225 % (seq_name, self.clean(record.annotations["accession"])))
226 elif record.id :
227 self.handle.write("#=GS %s AC %s\n" \
228 % (seq_name, self.clean(record.id)))
229
230
231 if record.description :
232 self.handle.write("#=GS %s DE %s\n" \
233 % (seq_name, self.clean(record.description)))
234
235
236 for xref in record.dbxrefs :
237 self.handle.write("#=GS %s DR %s\n" \
238 % (seq_name, self.clean(xref)))
239
240
241 for key, value in record.annotations.iteritems() :
242 if key in self.pfam_gs_mapping :
243 data = self.clean(str(value))
244 if data :
245 self.handle.write("#=GS %s %s %s\n" \
246 % (seq_name,
247 self.clean(self.pfam_gs_mapping[key]),
248 data))
249 else :
250
251
252 pass
253
254
255 for key, value in record.letter_annotations.iteritems() :
256 if key in self.pfam_gr_mapping and len(str(value))==len(record.seq) :
257 data = self.clean(str(value))
258 if data :
259 self.handle.write("#=GR %s %s %s\n" \
260 % (seq_name,
261 self.clean(self.pfam_gr_mapping[key]),
262 data))
263 else :
264
265
266 pass
267
269 """Loads a Stockholm file from PFAM into Alignment objects.
270
271 The file may contain multiple concatenated alignments, which are loaded
272 and returned incrementally.
273
274 This parser will detect if the Stockholm file follows the PFAM
275 conventions for sequence specific meta-data (lines starting #=GS
276 and #=GR) and populates the SeqRecord fields accordingly.
277
278 Any annotation which does not follow the PFAM conventions is currently
279 ignored.
280
281 If an accession is provided for an entry in the meta data, IT WILL NOT
282 be used as the record.id (it will be recorded in the record's
283 annotations). This is because some files have (sub) sequences from
284 different parts of the same accession (differentiated by different
285 start-end positions).
286
287 Wrap-around alignments are not supported - each sequences must be on
288 a single line. However, interlaced sequences should work.
289
290 For more information on the file format, please see:
291 http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format
292 http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html
293
294 For consistency with BioPerl and EMBOSS we call this the "stockholm"
295 format.
296 """
297
298
299
300 pfam_gr_mapping = {"SS" : "secondary_structure",
301 "SA" : "surface_accessibility",
302 "TM" : "transmembrane",
303 "PP" : "posterior_probability",
304 "LI" : "ligand_binding",
305 "AS" : "active_site",
306 "IN" : "intron"}
307
308 pfam_gs_mapping = {"OS" : "organism",
309 "OC" : "organism_classification",
310 "LO" : "look"}
311
313 try :
314 line = self._header
315 del self._header
316 except AttributeError :
317 line = self.handle.readline()
318 if not line:
319
320 return
321 if not line.strip() == '# STOCKHOLM 1.0':
322 raise ValueError("Did not find STOCKHOLM header")
323
324
325
326
327
328
329
330
331 seqs = {}
332 ids = []
333 gs = {}
334 gr = {}
335 gf = {}
336 passed_end_alignment = False
337 while 1:
338 line = self.handle.readline()
339 if not line: break
340 line = line.strip()
341 if line == '# STOCKHOLM 1.0':
342 self._header = line
343 break
344 elif line == "//" :
345
346
347 passed_end_alignment = True
348 elif line == "" :
349
350 pass
351 elif line[0] != "#" :
352
353
354 assert not passed_end_alignment
355 parts = [x.strip() for x in line.split(" ",1)]
356 if len(parts) != 2 :
357
358 raise ValueError("Could not split line into identifier " \
359 + "and sequence:\n" + line)
360 id, seq = parts
361 if id not in ids :
362 ids.append(id)
363 seqs.setdefault(id, '')
364 seqs[id] += seq.replace(".","-")
365 elif len(line) >= 5 :
366
367 if line[:5] == "#=GF " :
368
369
370 feature, text = line[5:].strip().split(None,1)
371
372
373 if feature not in gf :
374 gf[feature] = [text]
375 else :
376 gf[feature].append(text)
377 elif line[:5] == '#=GC ':
378
379
380 pass
381 elif line[:5] == '#=GS ':
382
383
384 id, feature, text = line[5:].strip().split(None,2)
385
386
387 if id not in gs :
388 gs[id] = {}
389 if feature not in gs[id] :
390 gs[id][feature] = [text]
391 else :
392 gs[id][feature].append(text)
393 elif line[:5] == "#=GR " :
394
395
396 id, feature, text = line[5:].strip().split(None,2)
397
398
399 if id not in gr :
400 gr[id] = {}
401 if feature not in gr[id]:
402 gr[id][feature] = ""
403 gr[id][feature] += text.strip()
404
405
406
407
408
409
410 assert len(seqs) <= len(ids)
411
412
413
414 self.ids = ids
415 self.sequences = seqs
416 self.seq_annotation = gs
417 self.seq_col_annotation = gr
418
419 if ids and seqs :
420
421 if self.records_per_alignment is not None \
422 and self.records_per_alignment != len(ids) :
423 raise ValueError("Found %i records in this alignment, told to expect %i" \
424 % (len(ids), self.records_per_alignment))
425
426 alignment = Alignment(self.alphabet)
427
428
429
430 alignment._annotations = gr
431
432 alignment_length = len(seqs.values()[0])
433 for id in ids :
434 seq = seqs[id]
435 if alignment_length != len(seq) :
436 raise ValueError("Sequences have different lengths, or repeated identifier")
437 name, start, end = self._identifier_split(id)
438 alignment.add_sequence(id, seq, start=start, end=end)
439
440 record = alignment.get_all_seqs()[-1]
441
442 assert record.id == id or record.description == id
443
444 record.id = id
445 record.name = name
446 record.description = id
447
448
449
450 record.annotations["accession"]=name
451
452 self._populate_meta_data(id, record)
453 return alignment
454 else :
455 return None
456
457
459 """Returns (name,start,end) string tuple from an identier."""
460 if identifier.find("/")!=-1 :
461 start_end = identifier.split("/",1)[1]
462 if start_end.count("-")==1 :
463 start, end = map(int, start_end.split("-"))
464 name = identifier.split("/",1)[0]
465 return (name, start, end)
466 return (identifier, None, None)
467
500
532
534 """Run the Bio.SeqIO module's doctests.
535
536 This will try and locate the unit tests directory, and run the doctests
537 from there in order that the relative paths used in the examples work.
538 """
539 import doctest
540 import os
541 if os.path.isdir(os.path.join("..","..","Tests")) :
542 print "Runing doctests..."
543 cur_dir = os.path.abspath(os.curdir)
544 os.chdir(os.path.join("..","..","Tests"))
545 assert os.path.isfile("Stockholm/simple.sth")
546 doctest.testmod()
547 os.chdir(cur_dir)
548 del cur_dir
549 print "Done"
550
551 if __name__ == "__main__" :
552 _test()
553