1
2
3
4
5 """
6 Bio.AlignIO support for the "stockholm" format (used in the PFAM database).
7
8 You are expected to use this module via the Bio.AlignIO functions (or the
9 Bio.SeqIO functions if you want to work directly with the gapped sequences).
10
11 For example, consider a Stockholm alignment file containing the following:
12
13 # STOCKHOLM 1.0
14 #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>..
15 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU
16 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..--
17 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU
18 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----
19
20 #=GC SS_cons ......<<<<<<<.......>>>>>>>..>>>>>>>>...............
21 AP001509.1 CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
22 #=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>---------------
23 AE007476.1 UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
24 #=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>---------------
25 //
26
27 This is a single multiple sequence alignment, so you would probably load this
28 using the Bio.AlignIO.read() function:
29
30 >>> from Bio import AlignIO
31 >>> handle = open("Stockholm/simple.sth", "rU")
32 >>> align = AlignIO.read(handle, "stockholm")
33 >>> handle.close()
34 >>> print align
35 SingleLetterAlphabet() alignment with 2 rows and 104 columns
36 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-G...UGU AP001509.1
37 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-C...GAU AE007476.1
38 >>> for record in align :
39 ... print record.id, len(record)
40 AP001509.1 104
41 AE007476.1 104
42
43 This example file is clearly using RNA, so you might want the Alignment object
44 (and the SeqRecord objects it holds) to reflect this, rather than simple using
45 the default single letter alphabet as shown above. You can do this with an
46 optional argument to the Bio.AlignIO.read() function:
47
48 >>> from Bio import AlignIO
49 >>> from Bio.Alphabet import generic_rna
50 >>> handle = open("Stockholm/simple.sth", "rU")
51 >>> align = AlignIO.read(handle, "stockholm", alphabet=generic_rna)
52 >>> handle.close()
53 >>> print align
54 RNAAlphabet() alignment with 2 rows and 104 columns
55 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-G...UGU AP001509.1
56 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-C...GAU AE007476.1
57
58 In addition to the sequences themselves, this example alignment also includes
59 some GR lines for the secondary structure of the sequences. These are strings,
60 with one character for each letter in the associated sequence:
61
62 >>> for record in align :
63 ... print record.id
64 ... print record.seq
65 ... print record.letter_annotations['secondary_structure']
66 AP001509.1
67 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
68 -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
69 AE007476.1
70 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
71 -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
72
73 Any general annotation for each row is recorded in the SeqRecord's annotations
74 dictionary. You can output this alignment in many different file formats using
75 Bio.AlignIO.write(), or the Alignment object's format method:
76
77 >>> print align.format("fasta")
78 >AP001509.1
79 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-A
80 GGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
81 >AE007476.1
82 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAA
83 GGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
84 <BLANKLINE>
85
86 Most output formats won't be able to hold the annotation possible in a Stockholm file:
87
88 >>> print align.format("stockholm")
89 # STOCKHOLM 1.0
90 #=GF SQ 2
91 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
92 #=GS AP001509.1 AC AP001509.1
93 #=GS AP001509.1 DE AP001509.1
94 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
95 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
96 #=GS AE007476.1 AC AE007476.1
97 #=GS AE007476.1 DE AE007476.1
98 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
99 //
100 <BLANKLINE>
101
102 Note that when writing Stockholm files, Biopython does not break long sequences up and
103 interleave them (as in the input file shown above). The standard allows this simpler
104 layout, and it is more likely to be understood by other tools.
105
106 Finally, as an aside, it can sometimes be useful to use Bio.SeqIO.parse() to iterate over
107 the two rows as SeqRecord objects - rather than working with Alignnment objects.
108 Again, if you want to you can specify this is RNA:
109
110 >>> from Bio import SeqIO
111 >>> from Bio.Alphabet import generic_rna
112 >>> handle = open("Stockholm/simple.sth", "rU")
113 >>> for record in SeqIO.parse(handle, "stockholm", alphabet=generic_rna) :
114 ... print record.id
115 ... print record.seq
116 ... print record.letter_annotations['secondary_structure']
117 AP001509.1
118 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
119 -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
120 AE007476.1
121 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
122 -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
123 >>> handle.close()
124
125 """
126 from Bio.Align.Generic import Alignment
127 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
128
130 """Stockholm/PFAM alignment writer."""
131
132
133
134 pfam_gr_mapping = {"secondary_structure" : "SS",
135 "surface_accessibility" : "SA",
136 "transmembrane" : "TM",
137 "posterior_probability" : "PP",
138 "ligand_binding" : "LI",
139 "active_site" : "AS",
140 "intron" : "IN"}
141
142 pfam_gs_mapping = {"organism" : "OS",
143 "organism_classification" : "OC",
144 "look" : "LO"}
145
147 """Use this to write (another) single alignment to an open file.
148
149 Note that sequences and their annotation are recorded
150 together (rather than having a block of annotation followed
151 by a block of aligned sequences).
152 """
153 records = alignment.get_all_seqs()
154 count = len(records)
155
156 self._length_of_sequences = alignment.get_alignment_length()
157 self._ids_written = []
158
159
160
161
162 if count == 0 :
163 raise ValueError("Must have at least one sequence")
164 if self._length_of_sequences == 0 :
165 raise ValueError("Non-empty sequences are required")
166
167 self.handle.write("# STOCKHOLM 1.0\n")
168 self.handle.write("#=GF SQ %i\n" % count)
169 for record in records :
170 self._write_record(record)
171 self.handle.write("//\n")
172
174 """Write a single SeqRecord to the file"""
175 if self._length_of_sequences != len(record.seq) :
176 raise ValueError("Sequences must all be the same length")
177
178
179 seq_name = record.id
180 if record.name is not None :
181 if "accession" in record.annotations :
182 if record.id == record.annotations["accession"] :
183 seq_name = record.name
184
185
186 seq_name = seq_name.replace(" ","_")
187
188 if "start" in record.annotations \
189 and "end" in record.annotations :
190 suffix = "/%s-%s" % (str(record.annotations["start"]),
191 str(record.annotations["end"]))
192 if seq_name[-len(suffix):] != suffix :
193 seq_name = "%s/%s-%s" % (seq_name,
194 str(record.annotations["start"]),
195 str(record.annotations["end"]))
196
197 if seq_name in self._ids_written :
198 raise ValueError("Duplicate record identifier: %s" % seq_name)
199 self._ids_written.append(seq_name)
200 self.handle.write("%s %s\n" % (seq_name, record.seq.tostring()))
201
202
203
204
205
206
207
208
209
210
211
212
213
214 if "accession" in record.annotations :
215 self.handle.write("#=GS %s AC %s\n" \
216 % (seq_name, self.clean(record.annotations["accession"])))
217 elif record.id :
218 self.handle.write("#=GS %s AC %s\n" \
219 % (seq_name, self.clean(record.id)))
220
221
222 if record.description :
223 self.handle.write("#=GS %s DE %s\n" \
224 % (seq_name, self.clean(record.description)))
225
226
227 for xref in record.dbxrefs :
228 self.handle.write("#=GS %s DR %s\n" \
229 % (seq_name, self.clean(xref)))
230
231
232 for key, value in record.annotations.iteritems() :
233 if key in self.pfam_gs_mapping :
234 self.handle.write("#=GS %s %s %s\n" \
235 % (seq_name,
236 self.clean(self.pfam_gs_mapping[key]),
237 self.clean(str(value))))
238 else :
239
240
241 pass
242
243
244 for key, value in record.letter_annotations.iteritems() :
245 if key in self.pfam_gr_mapping and len(str(value))==len(record.seq) :
246 self.handle.write("#=GR %s %s %s\n" \
247 % (seq_name,
248 self.clean(self.pfam_gr_mapping[key]),
249 self.clean(str(value))))
250 else :
251
252
253 pass
254
256 """Loads a Stockholm file from PFAM into Alignment objects.
257
258 The file may contain multiple concatenated alignments, which are loaded
259 and returned incrementally.
260
261 This parser will detect if the Stockholm file follows the PFAM
262 conventions for sequence specific meta-data (lines starting #=GS
263 and #=GR) and populates the SeqRecord fields accordingly.
264
265 Any annotation which does not follow the PFAM conventions is currently
266 ignored.
267
268 If an accession is provided for an entry in the meta data, IT WILL NOT
269 be used as the record.id (it will be recorded in the record's
270 annotations). This is because some files have (sub) sequences from
271 different parts of the same accession (differentiated by different
272 start-end positions).
273
274 Wrap-around alignments are not supported - each sequences must be on
275 a single line. However, interlaced sequences should work.
276
277 For more information on the file format, please see:
278 http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format
279 http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html
280
281 For consistency with BioPerl and EMBOSS we call this the "stockholm"
282 format.
283 """
284
285
286
287 pfam_gr_mapping = {"SS" : "secondary_structure",
288 "SA" : "surface_accessibility",
289 "TM" : "transmembrane",
290 "PP" : "posterior_probability",
291 "LI" : "ligand_binding",
292 "AS" : "active_site",
293 "IN" : "intron"}
294
295 pfam_gs_mapping = {"OS" : "organism",
296 "OC" : "organism_classification",
297 "LO" : "look"}
298
300 try :
301 line = self._header
302 del self._header
303 except AttributeError :
304 line = self.handle.readline()
305 if not line:
306
307 return
308 if not line.strip() == '# STOCKHOLM 1.0':
309 raise ValueError("Did not find STOCKHOLM header")
310
311
312
313
314
315
316
317
318 seqs = {}
319 ids = []
320 gs = {}
321 gr = {}
322 gf = {}
323 passed_end_alignment = False
324 while 1:
325 line = self.handle.readline()
326 if not line: break
327 line = line.strip()
328 if line == '# STOCKHOLM 1.0':
329 self._header = line
330 break
331 elif line == "//" :
332
333
334 passed_end_alignment = True
335 elif line == "" :
336
337 pass
338 elif line[0] != "#" :
339
340
341 assert not passed_end_alignment
342 parts = [x.strip() for x in line.split(" ",1)]
343 if len(parts) != 2 :
344
345 raise ValueError("Could not split line into identifier " \
346 + "and sequence:\n" + line)
347 id, seq = parts
348 if id not in ids :
349 ids.append(id)
350 seqs.setdefault(id, '')
351 seqs[id] += seq.replace(".","-")
352 elif len(line) >= 5 :
353
354 if line[:5] == "#=GF " :
355
356
357 feature, text = line[5:].strip().split(None,1)
358
359
360 if feature not in gf :
361 gf[feature] = [text]
362 else :
363 gf[feature].append(text)
364 elif line[:5] == '#=GC ':
365
366
367 pass
368 elif line[:5] == '#=GS ':
369
370
371 id, feature, text = line[5:].strip().split(None,2)
372
373
374 if id not in gs :
375 gs[id] = {}
376 if feature not in gs[id] :
377 gs[id][feature] = [text]
378 else :
379 gs[id][feature].append(text)
380 elif line[:5] == "#=GR " :
381
382
383 id, feature, text = line[5:].strip().split(None,2)
384
385
386 if id not in gr :
387 gr[id] = {}
388 if feature not in gr[id]:
389 gr[id][feature] = ""
390 gr[id][feature] += text.strip()
391
392
393
394
395
396
397 assert len(seqs) <= len(ids)
398
399
400
401 self.ids = ids
402 self.sequences = seqs
403 self.seq_annotation = gs
404 self.seq_col_annotation = gr
405
406 if ids and seqs :
407
408 if self.records_per_alignment is not None \
409 and self.records_per_alignment != len(ids) :
410 raise ValueError("Found %i records in this alignment, told to expect %i" \
411 % (len(ids), self.records_per_alignment))
412
413 alignment = Alignment(self.alphabet)
414
415
416
417 alignment._annotations = gr
418
419 alignment_length = len(seqs.values()[0])
420 for id in ids :
421 seq = seqs[id]
422 if alignment_length != len(seq) :
423 raise ValueError("Sequences have different lengths, or repeated identifier")
424 name, start, end = self._identifier_split(id)
425 alignment.add_sequence(id, seq, start=start, end=end)
426
427 record = alignment.get_all_seqs()[-1]
428
429 assert record.id == id or record.description == id
430
431 record.id = id
432 record.name = name
433 record.description = id
434
435
436
437 record.annotations["accession"]=name
438
439 self._populate_meta_data(id, record)
440 return alignment
441 else :
442 return None
443
444
446 """Returns (name,start,end) string tuple from an identier."""
447 if identifier.find("/")!=-1 :
448 start_end = identifier.split("/",1)[1]
449 if start_end.count("-")==1 :
450 start, end = map(int, start_end.split("-"))
451 name = identifier.split("/",1)[0]
452 return (name, start, end)
453 return (identifier, None, None)
454
487
519
521 """Run the Bio.SeqIO module's doctests.
522
523 This will try and locate the unit tests directory, and run the doctests
524 from there in order that the relative paths used in the examples work.
525 """
526 import doctest
527 import os
528 if os.path.isdir(os.path.join("..","..","Tests")) :
529 print "Runing doctests..."
530 cur_dir = os.path.abspath(os.curdir)
531 os.chdir(os.path.join("..","..","Tests"))
532 assert os.path.isfile("Stockholm/simple.sth")
533 doctest.testmod()
534 os.chdir(cur_dir)
535 del cur_dir
536 print "Done"
537
538 if __name__ == "__main__" :
539 _test()
540