Package Bio :: Package Clustalw
[hide private]
[frames] | no frames]

Source Code for Package Bio.Clustalw

  1  # Clustalw modules 
  2   
  3  """ 
  4  A set of classes to interact with the multiple alignment command 
  5  line program clustalw.  
  6   
  7  Clustalw is the command line version of the graphical Clustalx  
  8  aligment program. 
  9   
 10  This requires clustalw available from: 
 11   
 12  ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalW/. 
 13   
 14  functions: 
 15  o read 
 16  o parse_file 
 17  o do_alignment 
 18   
 19  classes: 
 20  o ClustalAlignment 
 21  o MultipleAlignCL""" 
 22   
 23  # standard library 
 24  import os 
 25  import sys 
 26   
 27  # biopython 
 28  from Bio import Alphabet 
 29  from Bio.Alphabet import IUPAC 
 30  from Bio.Align.Generic import Alignment 
 31   
32 -def parse_file(file_name, alphabet = IUPAC.unambiguous_dna, debug_level = 0):
33 """Parse the given file into a clustal aligment object. 34 35 Arguments: 36 o file_name - The name of the file to parse. 37 o alphabet - The type of alphabet to use for the alignment sequences. 38 This should correspond to the type of information contained in the file. 39 Defaults to be unambiguous_dna sequence. 40 41 There is a deprecated optional argument debug_level which has no effect. 42 43 Since Biopython 1.46, this has called Bio.AlignIO internally. 44 """ 45 46 # Avoid code duplication by calling Bio.AlignIO to do this for us. 47 handle = open(file_name, 'r') 48 from Bio import AlignIO 49 generic_alignment = AlignIO.read(handle, "clustal") 50 handle.close() 51 52 #Force this generic alignment into a ClustalAlignment... nasty hack 53 if isinstance(alphabet, Alphabet.Gapped) : 54 alpha = alphabet 55 else : 56 alpha = Alphabet.Gapped(alphabet) 57 clustal_alignment = ClustalAlignment(alpha) 58 clustal_alignment._records = generic_alignment._records 59 for record in clustal_alignment._records : 60 record.seq.alphabet = alpha 61 62 try : 63 clustal_alignment._version = generic_alignment._version 64 except AttributeError : 65 #Missing the version, could be a 3rd party tool's output 66 pass 67 68 try : 69 clustal_alignment._star_info = generic_alignment._star_info 70 except AttributeError : 71 #Missing the consensus, again, this is not always present 72 pass 73 74 return clustal_alignment
75
76 -def do_alignment(command_line, alphabet=None):
77 """Perform an alignment with the given command line. 78 79 Arguments: 80 o command_line - A command line object that can give out 81 the command line we will input into clustalw. 82 o alphabet - the alphabet to use in the created alignment. If not 83 specified IUPAC.unambiguous_dna and IUPAC.protein will be used for 84 dna and protein alignment respectively. 85 86 Returns: 87 o A clustal alignment object corresponding to the created alignment. 88 If the alignment type was not a clustal object, None is returned. 89 """ 90 #Try and use subprocess (available in python 2.4+) 91 try : 92 import subprocess 93 #We don't need to supply any piped input, but we setup the 94 #standard input pipe anyway as a work around for a python 95 #bug if this is called from a Windows GUI program. For 96 #details, see http://bugs.python.org/issue1124861 97 child_process = subprocess.Popen(str(command_line), 98 stdin=subprocess.PIPE, 99 stdout=subprocess.PIPE, 100 stderr=subprocess.PIPE, 101 shell=(sys.platform!="win32") 102 ) 103 #Use .communicate as can get deadlocks with .wait(), see Bug 2804 104 child_process.communicate() #ignore the stdout and strerr data 105 value = child_process.returncode 106 except ImportError : 107 #Fall back for python 2.3 108 run_clust = os.popen(str(command_line)) 109 status = run_clust.close() 110 # The exit status is the second byte of the termination status 111 # TODO - Check this holds on win32... 112 value = 0 113 if status: value = status / 256 114 115 # check the return value for errors, as on 1.81 the return value 116 # from Clustalw is actually helpful for figuring out errors 117 # 1 => bad command line option 118 if value == 1: 119 raise ValueError("Bad command line option in the command: %s" 120 % str(command_line)) 121 # 2 => can't open sequence file 122 elif value == 2: 123 raise IOError("Cannot open sequence file %s" 124 % command_line.sequence_file) 125 # 3 => wrong format in sequence file 126 elif value == 3: 127 raise IOError("Sequence file %s has an invalid format." 128 % command_line.sequence_file) 129 # 4 => sequence file only has one sequence 130 elif value == 4: 131 raise IOError("Sequence file %s has only one sequence present." 132 % command_line.sequence_file) 133 134 # if an output file was specified, we need to grab it 135 if command_line.output_file: 136 out_file = command_line.output_file 137 else: 138 out_file = os.path.splitext(command_line.sequence_file)[0] + '.aln' 139 140 # if we can't deal with the format, just return None 141 if command_line.output_type and command_line.output_type != 'CLUSTAL': 142 return None 143 # otherwise parse it into a ClustalAlignment object 144 else: 145 if not alphabet: 146 alphabet = (IUPAC.unambiguous_dna, IUPAC.protein)[ 147 command_line.type == 'PROTEIN'] 148 149 # check if the outfile exists before parsing 150 if not(os.path.exists(out_file)): 151 raise IOError("Output .aln file %s not produced, commandline: %s" 152 % (out_file, command_line)) 153 154 return parse_file(out_file, alphabet)
155 156
157 -class ClustalAlignment(Alignment):
158 """Work with the clustal aligment format. 159 160 This format is the default output from clustal -- these files normally 161 have an extension of .aln. 162 """ 163 # the default version to use if one isn't set 164 DEFAULT_VERSION = '1.81' 165
166 - def __init__(self, alphabet = Alphabet.Gapped(IUPAC.ambiguous_dna)):
167 Alignment.__init__(self, alphabet) 168 169 # represent all of those stars in the aln output format 170 self._star_info = '' 171 172 self._version = ''
173
174 - def __str__(self):
175 """Print out the alignment so it looks pretty. 176 177 The output produced from this should also be formatted in valid 178 clustal format. 179 """ 180 # Avoid code duplication by calling Bio.AlignIO to do this for us. 181 from Bio import AlignIO 182 from StringIO import StringIO 183 handle = StringIO() 184 AlignIO.write([self], handle, "clustal") 185 handle.seek(0) 186 return handle.read()
187
188 -def _escape_filename(filename) :
189 """Escape filenames with spaces (PRIVATE). 190 191 NOTE - This code is duplicated in Bio.Blast.NCBIStandalone, 192 scope for refactoring! 193 """ 194 if " " not in filename : 195 return filename 196 197 #We'll just quote it - works on Mac etc 198 if filename.startswith('"') and filename.endswith('"') : 199 #Its already quoted 200 return filename 201 else : 202 return '"%s"' % filename
203
204 -class MultipleAlignCL:
205 """Represent a clustalw multiple alignment command line. 206 207 This is meant to make it easy to code the command line options you 208 want to submit to clustalw. 209 210 Clustalw has a ton of options and things to do but this is set up to 211 represent a clustalw mutliple alignment. 212 213 Warning: I don't use all of these options personally, so if you find 214 one to be broken for any reason, please let us know! 215 """ 216 # set the valid options for different parameters 217 OUTPUT_TYPES = ['GCG', 'GDE', 'PHYLIP', 'PIR', 'NEXUS', 'FASTA'] 218 OUTPUT_ORDER = ['INPUT', 'ALIGNED'] 219 OUTPUT_CASE = ['LOWER', 'UPPER'] 220 OUTPUT_SEQNOS = ['OFF', 'ON'] 221 RESIDUE_TYPES = ['PROTEIN', 'DNA'] 222 PROTEIN_MATRIX = ['BLOSUM', 'PAM', 'GONNET', 'ID'] 223 DNA_MATRIX = ['IUB', 'CLUSTALW'] 224
225 - def __init__(self, sequence_file, command = 'clustalw'):
226 """Initialize some general parameters that can be set as attributes. 227 228 Arguments: 229 o sequence_file - The file to read the sequences for alignment from. 230 o command - The command used to run clustalw. This defaults to 231 just 'clustalw' (ie. assumes you have it on your path somewhere). 232 233 General attributes that can be set: 234 o is_quick - if set as 1, will use a fast algorithm to create 235 the alignment guide tree. 236 o allow_negative - allow negative values in the alignment matrix. 237 238 Multiple alignment attributes that can be set as attributes: 239 o gap_open_pen - Gap opening penalty 240 o gap_ext_pen - Gap extension penalty 241 o is_no_end_pen - A flag as to whether or not there should be a gap 242 separation penalty for the ends. 243 o gap_sep_range - The gap separation penalty range. 244 o is_no_pgap - A flag to turn off residue specific gaps 245 o is_no_hgap - A flag to turn off hydrophilic gaps 246 o h_gap_residues - A list of residues to count a hydrophilic 247 o max_div - A percent identity to use for delay (? - I don't undertand 248 this!) 249 o trans_weight - The weight to use for transitions 250 """ 251 self.sequence_file = sequence_file 252 self.command = command 253 254 self.is_quick = None 255 self.allow_negative = None 256 257 self.gap_open_pen = None 258 self.gap_ext_pen = None 259 self.is_no_end_pen = None 260 self.gap_sep_range = None 261 self.is_no_pgap = None 262 self.is_no_hgap = None 263 self.h_gap_residues = [] 264 self.max_div = None 265 self.trans_weight = None 266 267 # other attributes that should be set via various functions 268 # 1. output parameters 269 self.output_file = None 270 self.output_type = None 271 self.output_order = None 272 self.change_case = None 273 self.add_seqnos = None 274 275 # 2. a guide tree to use 276 self.guide_tree = None 277 self.new_tree = None 278 279 # 3. matrices 280 self.protein_matrix = None 281 self.dna_matrix = None 282 283 # 4. type of residues 284 self.type = None
285
286 - def __str__(self):
287 """Write out the command line as a string.""" 288 289 #On Linux with clustalw 1.83, you can do: 290 #clustalw input.faa 291 #clustalw /full/path/input.faa 292 #clustalw -INFILE=input.faa 293 #clustalw -INFILE=/full/path/input.faa 294 # 295 #Note these fail (using DOS style slashes): 296 # 297 #clustalw /INFILE=input.faa 298 #clustalw /INFILE=/full/path/input.faa 299 # 300 #On Windows XP with clustalw.exe 1.83, these work at 301 #the command prompt: 302 # 303 #clustalw.exe input.faa 304 #clustalw.exe /INFILE=input.faa 305 #clustalw.exe /INFILE="input.faa" 306 #clustalw.exe /INFILE="with space.faa" 307 #clustalw.exe /INFILE=C:\full\path\input.faa 308 #clustalw.exe /INFILE="C:\full path\with spaces.faa" 309 # 310 #Sadly these fail: 311 #clustalw.exe "input.faa" 312 #clustalw.exe "with space.faa" 313 #clustalw.exe C:\full\path\input.faa 314 #clustalw.exe "C:\full path\with spaces.faa" 315 # 316 #Testing today (using a different binary of clustalw.exe 1.83), 317 #using -INFILE as follows seems to work. However I had once noted: 318 #These also fail but a minus/dash does seem to 319 #work with other options (!): 320 #clustalw.exe -INFILE=input.faa 321 #clustalw.exe -INFILE=C:\full\path\input.faa 322 # 323 #Also these fail: 324 #clustalw.exe "/INFILE=input.faa" 325 #clustalw.exe "/INFILE=C:\full\path\input.faa" 326 # 327 #Thanks to Emanuel Hey for flagging this on the mailing list. 328 # 329 #In addition, both self.command and self.sequence_file 330 #may contain spaces, so should be quoted. But clustalw 331 #is fussy. 332 cline = _escape_filename(self.command) 333 cline += ' -INFILE=%s' % _escape_filename(self.sequence_file) 334 335 # general options 336 if self.type: 337 cline += " -TYPE=%s" % self.type 338 if self.is_quick == 1: 339 #Some versions of clustalw are case sensitive, 340 #and require -quicktree rather than -QUICKTREE 341 cline += " -quicktree" 342 if self.allow_negative == 1: 343 cline += " -NEGATIVE" 344 345 # output options 346 if self.output_file: 347 cline += " -OUTFILE=%s" % _escape_filename(self.output_file) 348 if self.output_type: 349 cline += " -OUTPUT=%s" % self.output_type 350 if self.output_order: 351 cline += " -OUTORDER=%s" % self.output_order 352 if self.change_case: 353 cline += " -CASE=%s" % self.change_case 354 if self.add_seqnos: 355 cline += " -SEQNOS=%s" % self.add_seqnos 356 if self.new_tree: 357 # clustal does not work if -align is written -ALIGN 358 cline += " -NEWTREE=%s -align" % _escape_filename(self.new_tree) 359 360 # multiple alignment options 361 if self.guide_tree: 362 cline += " -USETREE=%s" % _escape_filename(self.guide_tree) 363 if self.protein_matrix: 364 cline += " -MATRIX=%s" % self.protein_matrix 365 if self.dna_matrix: 366 cline += " -DNAMATRIX=%s" % self.dna_matrix 367 if self.gap_open_pen: 368 cline += " -GAPOPEN=%s" % self.gap_open_pen 369 if self.gap_ext_pen: 370 cline += " -GAPEXT=%s" % self.gap_ext_pen 371 if self.is_no_end_pen == 1: 372 cline += " -ENDGAPS" 373 if self.gap_sep_range: 374 cline += " -GAPDIST=%s" % self.gap_sep_range 375 if self.is_no_pgap == 1: 376 cline += " -NOPGAP" 377 if self.is_no_hgap == 1: 378 cline += " -NOHGAP" 379 if len(self.h_gap_residues) != 0: 380 # stick the list of residues together as one big list o' residues 381 residue_list = '' 382 for residue in self.h_gap_residues: 383 residue_list = residue_list + residue 384 cline += " -HGAPRESIDUES=%s" % residue_list 385 if self.max_div: 386 cline += " -MAXDIV=%s" % self.max_div 387 if self.trans_weight: 388 cline += " -TRANSWEIGHT=%s" % self.trans_weight 389 390 return cline
391
392 - def set_output(self, output_file, output_type = None, output_order = None, 393 change_case = None, add_seqnos = None):
394 """Set the output parameters for the command line. 395 """ 396 self.output_file = output_file 397 398 if output_type: 399 output_type = output_type.upper() 400 if output_type not in self.OUTPUT_TYPES: 401 raise ValueError("Invalid output type %s. Valid choices are %s" 402 % (output_type, self.OUTPUT_TYPES)) 403 else: 404 self.output_type = output_type 405 406 if output_order: 407 output_order = output_order.upper() 408 if output_order not in self.OUTPUT_ORDER: 409 raise ValueError("Invalid output order %s. Valid choices are %s" 410 % (output_order, self.OUTPUT_ORDER)) 411 else: 412 self.output_order = output_order 413 414 if change_case: 415 change_case = change_case.upper() 416 if output_type != "GDE": 417 raise ValueError("Change case only valid for GDE output.") 418 elif change_case not in self.CHANGE_CASE: 419 raise ValueError("Invalid change case %s. Valid choices are %s" 420 % (change_case, self.CHANGE_CASE)) 421 else: 422 self.change_case = change_case 423 424 if add_seqnos: 425 add_seqnos = add_seqnos.upper() 426 if output_type: 427 raise ValueError("Add SeqNos only valid for CLUSTAL output.") 428 elif add_seqnos not in self.OUTPUT_SEQNOS: 429 raise ValueError("Invalid seqnos option %s. Valid choices: %s" 430 % (add_seqnos, self.OUTPUT_SEQNOS)) 431 else: 432 self.add_seqnos = add_seqnos
433
434 - def set_guide_tree(self, tree_file):
435 """Provide a file to use as the guide tree for alignment. 436 437 Raises: 438 o IOError - If the tree_file doesn't exist.""" 439 if not(os.path.exists(tree_file)): 440 raise IOError("Could not find the guide tree file %s." % 441 tree_file) 442 else: 443 self.guide_tree = tree_file
444
445 - def set_new_guide_tree(self, tree_file):
446 """Set the name of the guide tree file generated in the alignment. 447 """ 448 self.new_tree = tree_file
449
450 - def set_protein_matrix(self, protein_matrix):
451 """Set the type of protein matrix to use. 452 453 Protein matrix can be either one of the defined types (blosum, pam, 454 gonnet or id) or a file with your own defined matrix. 455 """ 456 if protein_matrix.upper() in self.PROTEIN_MATRIX: 457 self.protein_matrix = protein_matrix.upper() 458 elif os.path.exists(protein_matrix): 459 self.protein_matrix = protein_matrix 460 else: 461 raise ValueError("Invalid matrix %s. Options are %s or a file." % 462 (protein_matrix.upper(), self.PROTEIN_MATRIX))
463
464 - def set_dna_matrix(self, dna_matrix):
465 """Set the type of DNA matrix to use. 466 467 The dna_matrix can either be one of the defined types (iub or clustalw) 468 or a file with the matrix to use.""" 469 if dna_matrix.upper() in self.DNA_MATRIX: 470 self.dna_matrix = dna_matrix.upper() 471 elif os.path.exists(dna_matrix): 472 self.dna_matrix = dna_matrix 473 else: 474 raise ValueError("Invalid matrix %s. Options are %s or a file." % 475 (dna_matrix, self.DNA_MATRIX))
476
477 - def set_type(self, residue_type):
478 """Set the type of residues within the file. 479 480 Clustal tries to guess whether the info is protein or DNA based on 481 the number of GATCs, but this can be wrong if you have a messed up 482 protein or DNA you are working with, so this allows you to set it 483 explicitly. 484 """ 485 residue_type = residue_type.upper() 486 if residue_type in self.RESIDUE_TYPES: 487 self.type = residue_type 488 else: 489 raise ValueError("Invalid residue type %s. Valid choices are %s" 490 % (residue_type, self.RESIDUE_TYPES))
491