1
2
3 """
4 A set of classes to interact with the multiple alignment command
5 line program clustalw.
6
7 Clustalw is the command line version of the graphical Clustalx
8 aligment program.
9
10 This requires clustalw available from:
11
12 ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalW/.
13
14 functions:
15 o read
16 o parse_file
17 o do_alignment
18
19 classes:
20 o ClustalAlignment
21 o MultipleAlignCL"""
22
23
24 import os
25 import sys
26
27
28 from Bio import Alphabet
29 from Bio.Alphabet import IUPAC
30 from Bio.Align.Generic import Alignment
31
33 """Parse the given file into a clustal aligment object.
34
35 Arguments:
36 o file_name - The name of the file to parse.
37 o alphabet - The type of alphabet to use for the alignment sequences.
38 This should correspond to the type of information contained in the file.
39 Defaults to be unambiguous_dna sequence.
40
41 There is a deprecated optional argument debug_level which has no effect.
42
43 Since Biopython 1.46, this has called Bio.AlignIO internally.
44 """
45
46
47 handle = open(file_name, 'r')
48 from Bio import AlignIO
49 generic_alignment = AlignIO.read(handle, "clustal")
50 handle.close()
51
52
53 if isinstance(alphabet, Alphabet.Gapped) :
54 alpha = alphabet
55 else :
56 alpha = Alphabet.Gapped(alphabet)
57 clustal_alignment = ClustalAlignment(alpha)
58 clustal_alignment._records = generic_alignment._records
59 for record in clustal_alignment._records :
60 record.seq.alphabet = alpha
61
62 try :
63 clustal_alignment._version = generic_alignment._version
64 except AttributeError :
65
66 pass
67
68 try :
69 clustal_alignment._star_info = generic_alignment._star_info
70 except AttributeError :
71
72 pass
73
74 return clustal_alignment
75
77 """Perform an alignment with the given command line.
78
79 Arguments:
80 o command_line - A command line object that can give out
81 the command line we will input into clustalw.
82 o alphabet - the alphabet to use in the created alignment. If not
83 specified IUPAC.unambiguous_dna and IUPAC.protein will be used for
84 dna and protein alignment respectively.
85
86 Returns:
87 o A clustal alignment object corresponding to the created alignment.
88 If the alignment type was not a clustal object, None is returned.
89 """
90
91 try :
92 import subprocess
93
94
95
96
97 child_process = subprocess.Popen(str(command_line),
98 stdin=subprocess.PIPE,
99 stdout=subprocess.PIPE,
100 stderr=subprocess.PIPE,
101 shell=(sys.platform!="win32")
102 )
103
104 child_process.communicate()
105 value = child_process.returncode
106 except ImportError :
107
108 run_clust = os.popen(str(command_line))
109 status = run_clust.close()
110
111
112 value = 0
113 if status: value = status / 256
114
115
116
117
118 if value == 1:
119 raise ValueError("Bad command line option in the command: %s"
120 % str(command_line))
121
122 elif value == 2:
123 raise IOError("Cannot open sequence file %s"
124 % command_line.sequence_file)
125
126 elif value == 3:
127 raise IOError("Sequence file %s has an invalid format."
128 % command_line.sequence_file)
129
130 elif value == 4:
131 raise IOError("Sequence file %s has only one sequence present."
132 % command_line.sequence_file)
133
134
135 if command_line.output_file:
136 out_file = command_line.output_file
137 else:
138 out_file = os.path.splitext(command_line.sequence_file)[0] + '.aln'
139
140
141 if command_line.output_type and command_line.output_type != 'CLUSTAL':
142 return None
143
144 else:
145 if not alphabet:
146 alphabet = (IUPAC.unambiguous_dna, IUPAC.protein)[
147 command_line.type == 'PROTEIN']
148
149
150 if not(os.path.exists(out_file)):
151 raise IOError("Output .aln file %s not produced, commandline: %s"
152 % (out_file, command_line))
153
154 return parse_file(out_file, alphabet)
155
156
158 """Work with the clustal aligment format.
159
160 This format is the default output from clustal -- these files normally
161 have an extension of .aln.
162 """
163
164 DEFAULT_VERSION = '1.81'
165
173
175 """Print out the alignment so it looks pretty.
176
177 The output produced from this should also be formatted in valid
178 clustal format.
179 """
180
181 from Bio import AlignIO
182 from StringIO import StringIO
183 handle = StringIO()
184 AlignIO.write([self], handle, "clustal")
185 handle.seek(0)
186 return handle.read()
187
189 """Escape filenames with spaces (PRIVATE).
190
191 NOTE - This code is duplicated in Bio.Blast.NCBIStandalone,
192 scope for refactoring!
193 """
194 if " " not in filename :
195 return filename
196
197
198 if filename.startswith('"') and filename.endswith('"') :
199
200 return filename
201 else :
202 return '"%s"' % filename
203
205 """Represent a clustalw multiple alignment command line.
206
207 This is meant to make it easy to code the command line options you
208 want to submit to clustalw.
209
210 Clustalw has a ton of options and things to do but this is set up to
211 represent a clustalw mutliple alignment.
212
213 Warning: I don't use all of these options personally, so if you find
214 one to be broken for any reason, please let us know!
215 """
216
217 OUTPUT_TYPES = ['GCG', 'GDE', 'PHYLIP', 'PIR', 'NEXUS', 'FASTA']
218 OUTPUT_ORDER = ['INPUT', 'ALIGNED']
219 OUTPUT_CASE = ['LOWER', 'UPPER']
220 OUTPUT_SEQNOS = ['OFF', 'ON']
221 RESIDUE_TYPES = ['PROTEIN', 'DNA']
222 PROTEIN_MATRIX = ['BLOSUM', 'PAM', 'GONNET', 'ID']
223 DNA_MATRIX = ['IUB', 'CLUSTALW']
224
225 - def __init__(self, sequence_file, command = 'clustalw'):
226 """Initialize some general parameters that can be set as attributes.
227
228 Arguments:
229 o sequence_file - The file to read the sequences for alignment from.
230 o command - The command used to run clustalw. This defaults to
231 just 'clustalw' (ie. assumes you have it on your path somewhere).
232
233 General attributes that can be set:
234 o is_quick - if set as 1, will use a fast algorithm to create
235 the alignment guide tree.
236 o allow_negative - allow negative values in the alignment matrix.
237
238 Multiple alignment attributes that can be set as attributes:
239 o gap_open_pen - Gap opening penalty
240 o gap_ext_pen - Gap extension penalty
241 o is_no_end_pen - A flag as to whether or not there should be a gap
242 separation penalty for the ends.
243 o gap_sep_range - The gap separation penalty range.
244 o is_no_pgap - A flag to turn off residue specific gaps
245 o is_no_hgap - A flag to turn off hydrophilic gaps
246 o h_gap_residues - A list of residues to count a hydrophilic
247 o max_div - A percent identity to use for delay (? - I don't undertand
248 this!)
249 o trans_weight - The weight to use for transitions
250 """
251 self.sequence_file = sequence_file
252 self.command = command
253
254 self.is_quick = None
255 self.allow_negative = None
256
257 self.gap_open_pen = None
258 self.gap_ext_pen = None
259 self.is_no_end_pen = None
260 self.gap_sep_range = None
261 self.is_no_pgap = None
262 self.is_no_hgap = None
263 self.h_gap_residues = []
264 self.max_div = None
265 self.trans_weight = None
266
267
268
269 self.output_file = None
270 self.output_type = None
271 self.output_order = None
272 self.change_case = None
273 self.add_seqnos = None
274
275
276 self.guide_tree = None
277 self.new_tree = None
278
279
280 self.protein_matrix = None
281 self.dna_matrix = None
282
283
284 self.type = None
285
287 """Write out the command line as a string."""
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332 cline = _escape_filename(self.command)
333 cline += ' -INFILE=%s' % _escape_filename(self.sequence_file)
334
335
336 if self.type:
337 cline += " -TYPE=%s" % self.type
338 if self.is_quick == 1:
339
340
341 cline += " -quicktree"
342 if self.allow_negative == 1:
343 cline += " -NEGATIVE"
344
345
346 if self.output_file:
347 cline += " -OUTFILE=%s" % _escape_filename(self.output_file)
348 if self.output_type:
349 cline += " -OUTPUT=%s" % self.output_type
350 if self.output_order:
351 cline += " -OUTORDER=%s" % self.output_order
352 if self.change_case:
353 cline += " -CASE=%s" % self.change_case
354 if self.add_seqnos:
355 cline += " -SEQNOS=%s" % self.add_seqnos
356 if self.new_tree:
357
358 cline += " -NEWTREE=%s -align" % _escape_filename(self.new_tree)
359
360
361 if self.guide_tree:
362 cline += " -USETREE=%s" % _escape_filename(self.guide_tree)
363 if self.protein_matrix:
364 cline += " -MATRIX=%s" % self.protein_matrix
365 if self.dna_matrix:
366 cline += " -DNAMATRIX=%s" % self.dna_matrix
367 if self.gap_open_pen:
368 cline += " -GAPOPEN=%s" % self.gap_open_pen
369 if self.gap_ext_pen:
370 cline += " -GAPEXT=%s" % self.gap_ext_pen
371 if self.is_no_end_pen == 1:
372 cline += " -ENDGAPS"
373 if self.gap_sep_range:
374 cline += " -GAPDIST=%s" % self.gap_sep_range
375 if self.is_no_pgap == 1:
376 cline += " -NOPGAP"
377 if self.is_no_hgap == 1:
378 cline += " -NOHGAP"
379 if len(self.h_gap_residues) != 0:
380
381 residue_list = ''
382 for residue in self.h_gap_residues:
383 residue_list = residue_list + residue
384 cline += " -HGAPRESIDUES=%s" % residue_list
385 if self.max_div:
386 cline += " -MAXDIV=%s" % self.max_div
387 if self.trans_weight:
388 cline += " -TRANSWEIGHT=%s" % self.trans_weight
389
390 return cline
391
392 - def set_output(self, output_file, output_type = None, output_order = None,
393 change_case = None, add_seqnos = None):
394 """Set the output parameters for the command line.
395 """
396 self.output_file = output_file
397
398 if output_type:
399 output_type = output_type.upper()
400 if output_type not in self.OUTPUT_TYPES:
401 raise ValueError("Invalid output type %s. Valid choices are %s"
402 % (output_type, self.OUTPUT_TYPES))
403 else:
404 self.output_type = output_type
405
406 if output_order:
407 output_order = output_order.upper()
408 if output_order not in self.OUTPUT_ORDER:
409 raise ValueError("Invalid output order %s. Valid choices are %s"
410 % (output_order, self.OUTPUT_ORDER))
411 else:
412 self.output_order = output_order
413
414 if change_case:
415 change_case = change_case.upper()
416 if output_type != "GDE":
417 raise ValueError("Change case only valid for GDE output.")
418 elif change_case not in self.CHANGE_CASE:
419 raise ValueError("Invalid change case %s. Valid choices are %s"
420 % (change_case, self.CHANGE_CASE))
421 else:
422 self.change_case = change_case
423
424 if add_seqnos:
425 add_seqnos = add_seqnos.upper()
426 if output_type:
427 raise ValueError("Add SeqNos only valid for CLUSTAL output.")
428 elif add_seqnos not in self.OUTPUT_SEQNOS:
429 raise ValueError("Invalid seqnos option %s. Valid choices: %s"
430 % (add_seqnos, self.OUTPUT_SEQNOS))
431 else:
432 self.add_seqnos = add_seqnos
433
435 """Provide a file to use as the guide tree for alignment.
436
437 Raises:
438 o IOError - If the tree_file doesn't exist."""
439 if not(os.path.exists(tree_file)):
440 raise IOError("Could not find the guide tree file %s." %
441 tree_file)
442 else:
443 self.guide_tree = tree_file
444
446 """Set the name of the guide tree file generated in the alignment.
447 """
448 self.new_tree = tree_file
449
451 """Set the type of protein matrix to use.
452
453 Protein matrix can be either one of the defined types (blosum, pam,
454 gonnet or id) or a file with your own defined matrix.
455 """
456 if protein_matrix.upper() in self.PROTEIN_MATRIX:
457 self.protein_matrix = protein_matrix.upper()
458 elif os.path.exists(protein_matrix):
459 self.protein_matrix = protein_matrix
460 else:
461 raise ValueError("Invalid matrix %s. Options are %s or a file." %
462 (protein_matrix.upper(), self.PROTEIN_MATRIX))
463
465 """Set the type of DNA matrix to use.
466
467 The dna_matrix can either be one of the defined types (iub or clustalw)
468 or a file with the matrix to use."""
469 if dna_matrix.upper() in self.DNA_MATRIX:
470 self.dna_matrix = dna_matrix.upper()
471 elif os.path.exists(dna_matrix):
472 self.dna_matrix = dna_matrix
473 else:
474 raise ValueError("Invalid matrix %s. Options are %s or a file." %
475 (dna_matrix, self.DNA_MATRIX))
476
478 """Set the type of residues within the file.
479
480 Clustal tries to guess whether the info is protein or DNA based on
481 the number of GATCs, but this can be wrong if you have a messed up
482 protein or DNA you are working with, so this allows you to set it
483 explicitly.
484 """
485 residue_type = residue_type.upper()
486 if residue_type in self.RESIDUE_TYPES:
487 self.type = residue_type
488 else:
489 raise ValueError("Invalid residue type %s. Valid choices are %s"
490 % (residue_type, self.RESIDUE_TYPES))
491