Package Bio :: Package Clustalw
[hide private]
[frames] | no frames]

Source Code for Package Bio.Clustalw

  1  # Clustalw modules 
  2   
  3  """ 
  4  A set of classes to interact with the multiple alignment command 
  5  line program clustalw.  
  6   
  7  Clustalw is the command line version of the graphical Clustalx  
  8  aligment program. 
  9   
 10  This requires clustalw available from: 
 11   
 12  ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalW/. 
 13   
 14  functions: 
 15  o read 
 16  o parse_file 
 17  o do_alignment 
 18   
 19  classes: 
 20  o ClustalAlignment 
 21  o MultipleAlignCL""" 
 22   
 23  # standard library 
 24  import os 
 25  import sys 
 26   
 27  # biopython 
 28  from Bio import Alphabet 
 29  from Bio.Alphabet import IUPAC 
 30  from Bio.Align.Generic import Alignment 
 31   
32 -def parse_file(file_name, alphabet = IUPAC.unambiguous_dna, debug_level = 0):
33 """Parse the given file into a clustal aligment object. 34 35 Arguments: 36 o file_name - The name of the file to parse. 37 o alphabet - The type of alphabet to use for the alignment sequences. 38 This should correspond to the type of information contained in the file. 39 Defaults to be unambiguous_dna sequence. 40 41 There is a deprecated optional argument debug_level which has no effect. 42 """ 43 44 # Avoid code duplication by calling Bio.AlignIO to do this for us. 45 handle = open(file_name, 'r') 46 from Bio import AlignIO 47 generic_alignment = AlignIO.read(handle, "clustal") 48 handle.close() 49 50 #Force this generic alignment into a ClustalAlignment... nasty hack 51 if isinstance(alphabet, Alphabet.Gapped) : 52 alpha = alphabet 53 else : 54 alpha = Alphabet.Gapped(alphabet) 55 clustal_alignment = ClustalAlignment(alpha) 56 clustal_alignment._records = generic_alignment._records 57 for record in clustal_alignment._records : 58 record.seq.alphabet = alpha 59 clustal_alignment._version = generic_alignment._version 60 clustal_alignment._star_info = generic_alignment._star_info 61 62 return clustal_alignment
63
64 -def do_alignment(command_line, alphabet=None):
65 """Perform an alignment with the given command line. 66 67 Arguments: 68 o command_line - A command line object that can give out 69 the command line we will input into clustalw. 70 o alphabet - the alphabet to use in the created alignment. If not 71 specified IUPAC.unambiguous_dna and IUPAC.protein will be used for 72 dna and protein alignment respectively. 73 74 Returns: 75 o A clustal alignment object corresponding to the created alignment. 76 If the alignment type was not a clustal object, None is returned. 77 """ 78 run_clust = os.popen(str(command_line)) 79 status = run_clust.close() 80 81 82 # The exit status is the second byte of the termination status 83 # TODO - Check this holds on win32... 84 value = 0 85 if status: value = status / 256 86 # check the return value for errors, as on 1.81 the return value 87 # from Clustalw is actually helpful for figuring out errors 88 # 1 => bad command line option 89 if value == 1: 90 raise ValueError("Bad command line option in the command: %s" 91 % str(command_line)) 92 # 2 => can't open sequence file 93 elif value == 2: 94 raise IOError("Cannot open sequence file %s" 95 % command_line.sequence_file) 96 # 3 => wrong format in sequence file 97 elif value == 3: 98 raise IOError("Sequence file %s has an invalid format." 99 % command_line.sequence_file) 100 # 4 => sequence file only has one sequence 101 elif value == 4: 102 raise IOError("Sequence file %s has only one sequence present." 103 % command_line.sequence_file) 104 105 # if an output file was specified, we need to grab it 106 if command_line.output_file: 107 out_file = command_line.output_file 108 else: 109 out_file = os.path.splitext(command_line.sequence_file)[0] + '.aln' 110 111 # if we can't deal with the format, just return None 112 if command_line.output_type and command_line.output_type != 'CLUSTAL': 113 return None 114 # otherwise parse it into a ClustalAlignment object 115 else: 116 if not alphabet: 117 alphabet = (IUPAC.unambiguous_dna, IUPAC.protein)[ 118 command_line.type == 'PROTEIN'] 119 120 # check if the outfile exists before parsing 121 if not(os.path.exists(out_file)): 122 raise IOError("Output .aln file %s not produced, commandline: %s" 123 % (out_file, command_line)) 124 125 return parse_file(out_file, alphabet)
126 127
128 -class ClustalAlignment(Alignment):
129 """Work with the clustal aligment format. 130 131 This format is the default output from clustal -- these files normally 132 have an extension of .aln. 133 """ 134 # the default version to use if one isn't set 135 DEFAULT_VERSION = '1.81' 136
137 - def __init__(self, alphabet = Alphabet.Gapped(IUPAC.ambiguous_dna)):
138 Alignment.__init__(self, alphabet) 139 140 # represent all of those stars in the aln output format 141 self._star_info = '' 142 143 self._version = ''
144
145 - def __str__(self):
146 """Print out the alignment so it looks pretty. 147 148 The output produced from this should also be formatted in valid 149 clustal format. 150 """ 151 # Avoid code duplication by calling Bio.AlignIO to do this for us. 152 from Bio import AlignIO 153 from StringIO import StringIO 154 handle = StringIO() 155 AlignIO.write([self], handle, "clustal") 156 handle.seek(0) 157 return handle.read()
158 159
160 - def _add_star_info(self, stars):
161 """Add all of the stars, which indicate consensus sequence. 162 """ 163 self._star_info = stars
164
165 - def _add_version(self, version):
166 """Add the version information about the clustal file being read. 167 """ 168 self._version = version
169 170
171 -class MultipleAlignCL:
172 """Represent a clustalw multiple alignment command line. 173 174 This is meant to make it easy to code the command line options you 175 want to submit to clustalw. 176 177 Clustalw has a ton of options and things to do but this is set up to 178 represent a clustalw mutliple alignment. 179 180 Warning: I don't use all of these options personally, so if you find 181 one to be broken for any reason, please let us know! 182 """ 183 # set the valid options for different parameters 184 OUTPUT_TYPES = ['GCG', 'GDE', 'PHYLIP', 'PIR', 'NEXUS', 'FASTA'] 185 OUTPUT_ORDER = ['INPUT', 'ALIGNED'] 186 OUTPUT_CASE = ['LOWER', 'UPPER'] 187 OUTPUT_SEQNOS = ['OFF', 'ON'] 188 RESIDUE_TYPES = ['PROTEIN', 'DNA'] 189 PROTEIN_MATRIX = ['BLOSUM', 'PAM', 'GONNET', 'ID'] 190 DNA_MATRIX = ['IUB', 'CLUSTALW'] 191
192 - def __init__(self, sequence_file, command = 'clustalw'):
193 """Initialize some general parameters that can be set as attributes. 194 195 Arguments: 196 o sequence_file - The file to read the sequences for alignment from. 197 o command - The command used to run clustalw. This defaults to 198 just 'clustalw' (ie. assumes you have it on your path somewhere). 199 200 General attributes that can be set: 201 o is_quick - if set as 1, will use a fast algorithm to create 202 the alignment guide tree. 203 o allow_negative - allow negative values in the alignment matrix. 204 205 Multiple alignment attributes that can be set as attributes: 206 o gap_open_pen - Gap opening penalty 207 o gap_ext_pen - Gap extension penalty 208 o is_no_end_pen - A flag as to whether or not there should be a gap 209 separation penalty for the ends. 210 o gap_sep_range - The gap separation penalty range. 211 o is_no_pgap - A flag to turn off residue specific gaps 212 o is_no_hgap - A flag to turn off hydrophilic gaps 213 o h_gap_residues - A list of residues to count a hydrophilic 214 o max_div - A percent identity to use for delay (? - I don't undertand 215 this!) 216 o trans_weight - The weight to use for transitions 217 """ 218 self.sequence_file = sequence_file 219 self.command = command 220 221 self.is_quick = None 222 self.allow_negative = None 223 224 self.gap_open_pen = None 225 self.gap_ext_pen = None 226 self.is_no_end_pen = None 227 self.gap_sep_range = None 228 self.is_no_pgap = None 229 self.is_no_hgap = None 230 self.h_gap_residues = [] 231 self.max_div = None 232 self.trans_weight = None 233 234 # other attributes that should be set via various functions 235 # 1. output parameters 236 self.output_file = None 237 self.output_type = None 238 self.output_order = None 239 self.change_case = None 240 self.add_seqnos = None 241 242 # 2. a guide tree to use 243 self.guide_tree = None 244 self.new_tree = None 245 246 # 3. matrices 247 self.protein_matrix = None 248 self.dna_matrix = None 249 250 # 4. type of residues 251 self.type = None
252
253 - def __str__(self):
254 """Write out the command line as a string.""" 255 256 if sys.platform <> "win32" : 257 #On Linux with clustalw 1.83, you can do: 258 #clustalw input.faa 259 #clustalw /full/path/input.faa 260 #clustalw -INFILE=input.faa 261 #clustalw -INFILE=/full/path/input.faa 262 # 263 #Note these fail (using DOS style slashes): 264 # 265 #clustalw /INFILE=input.faa 266 #clustalw /INFILE=/full/path/input.faa 267 # 268 #To keep things simple, and follow the original 269 #behaviour of Bio.Clustalw use this: 270 cline = self.command + " " + self.sequence_file 271 else : 272 #On Windows XP with clustalw.exe 1.83, these work at 273 #the command prompt: 274 # 275 #clustalw.exe input.faa 276 #clustalw.exe /INFILE=input.faa 277 #clustalw.exe /INFILE="input.faa" 278 #clustalw.exe /INFILE="with space.faa" 279 #clustalw.exe /INFILE=C:\full\path\input.faa 280 #clustalw.exe /INFILE="C:\full path\with spaces.faa" 281 # 282 #Sadly these fail: 283 #clustalw.exe "input.faa" 284 #clustalw.exe "with space.faa" 285 #clustalw.exe C:\full\path\input.faa 286 #clustalw.exe "C:\full path\with spaces.faa" 287 # 288 #These also fail but a minus/dash does seem to 289 #work with other options (!): 290 #clustalw.exe -INFILE=input.faa 291 #clustalw.exe -INFILE=C:\full\path\input.faa 292 # 293 #Also these fail: 294 #clustalw.exe "/INFILE=input.faa" 295 #clustalw.exe "/INFILE=C:\full\path\input.faa" 296 # 297 #Thanks to Emanuel Hey for flagging this on the mailing list. 298 # 299 #In addtion, both self.command and self.sequence_file 300 #may contain spaces, so should be quoted. But clustalw 301 #is fussy. 302 if self.command.count(" ") > 0 : 303 cline = '"%s"' % self.command 304 else : 305 cline = self.command 306 if self.sequence_file.count(" ") > 0 : 307 cline += ' /INFILE="%s"' % self.sequence_file 308 else : 309 cline += ' /INFILE=%s' % self.sequence_file 310 311 # general options 312 if self.type: 313 cline += " -TYPE=%s" % self.type 314 if self.is_quick == 1: 315 #Some versions of clustalw are case sensitive, 316 #and require -quicktree rather than -QUICKTREE 317 cline += " -quicktree" 318 if self.allow_negative == 1: 319 cline += " -NEGATIVE" 320 321 # output options 322 if self.output_file: 323 cline += " -OUTFILE=%s" % self.output_file 324 if self.output_type: 325 cline += " -OUTPUT=%s" % self.output_type 326 if self.output_order: 327 cline += " -OUTORDER=%s" % self.output_order 328 if self.change_case: 329 cline += " -CASE=%s" % self.change_case 330 if self.add_seqnos: 331 cline += " -SEQNOS=%s" % self.add_seqnos 332 if self.new_tree: 333 # clustal does not work if -align is written -ALIGN 334 cline += " -NEWTREE=%s -align" % self.new_tree 335 336 # multiple alignment options 337 if self.guide_tree: 338 cline += " -USETREE=%s" % self.guide_tree 339 if self.protein_matrix: 340 cline += " -MATRIX=%s" % self.protein_matrix 341 if self.dna_matrix: 342 cline += " -DNAMATRIX=%s" % self.dna_matrix 343 if self.gap_open_pen: 344 cline += " -GAPOPEN=%s" % self.gap_open_pen 345 if self.gap_ext_pen: 346 cline += " -GAPEXT=%s" % self.gap_ext_pen 347 if self.is_no_end_pen == 1: 348 cline += " -ENDGAPS" 349 if self.gap_sep_range: 350 cline += " -GAPDIST=%s" % self.gap_sep_range 351 if self.is_no_pgap == 1: 352 cline += " -NOPGAP" 353 if self.is_no_hgap == 1: 354 cline += " -NOHGAP" 355 if len(self.h_gap_residues) != 0: 356 # stick the list of residues together as one big list o' residues 357 residue_list = '' 358 for residue in self.h_gap_residues: 359 residue_list = residue_list + residue 360 cline += " -HGAPRESIDUES=%s" % residue_list 361 if self.max_div: 362 cline += " -MAXDIV=%s" % self.max_div 363 if self.trans_weight: 364 cline += " -TRANSWEIGHT=%s" % self.trans_weight 365 366 return cline
367
368 - def set_output(self, output_file, output_type = None, output_order = None, 369 change_case = None, add_seqnos = None):
370 """Set the output parameters for the command line. 371 """ 372 self.output_file = output_file 373 374 if output_type: 375 output_type = output_type.upper() 376 if output_type not in self.OUTPUT_TYPES: 377 raise ValueError("Invalid output type %s. Valid choices are %s" 378 % (output_type, self.OUTPUT_TYPES)) 379 else: 380 self.output_type = output_type 381 382 if output_order: 383 output_order = output_order.upper() 384 if output_order not in self.OUTPUT_ORDER: 385 raise ValueError("Invalid output order %s. Valid choices are %s" 386 % (output_order, self.OUTPUT_ORDER)) 387 else: 388 self.output_order = output_order 389 390 if change_case: 391 change_case = change_case.upper() 392 if output_type != "GDE": 393 raise ValueError("Change case only valid for GDE output.") 394 elif change_case not in self.CHANGE_CASE: 395 raise ValueError("Invalid change case %s. Valid choices are %s" 396 % (change_case, self.CHANGE_CASE)) 397 else: 398 self.change_case = change_case 399 400 if add_seqnos: 401 add_seqnos = add_seqnos.upper() 402 if output_type: 403 raise ValueError("Add SeqNos only valid for CLUSTAL output.") 404 elif add_seqnos not in self.OUTPUT_SEQNOS: 405 raise ValueError("Invalid seqnos option %s. Valid choices: %s" 406 % (add_seqnos, self.OUTPUT_SEQNOS)) 407 else: 408 self.add_seqnos = add_seqnos
409
410 - def set_guide_tree(self, tree_file):
411 """Provide a file to use as the guide tree for alignment. 412 413 Raises: 414 o IOError - If the tree_file doesn't exist.""" 415 if not(os.path.exists(tree_file)): 416 raise IOError("Could not find the guide tree file %s." % 417 tree_file) 418 else: 419 self.guide_tree = tree_file
420
421 - def set_new_guide_tree(self, tree_file):
422 """Set the name of the guide tree file generated in the alignment. 423 """ 424 self.new_tree = tree_file
425
426 - def set_protein_matrix(self, protein_matrix):
427 """Set the type of protein matrix to use. 428 429 Protein matrix can be either one of the defined types (blosum, pam, 430 gonnet or id) or a file with your own defined matrix. 431 """ 432 if protein_matrix.upper() in self.PROTEIN_MATRIX: 433 self.protein_matrix = protein_matrix.upper() 434 elif os.path.exists(protein_matrix): 435 self.protein_matrix = protein_matrix 436 else: 437 raise ValueError("Invalid matrix %s. Options are %s or a file." % 438 (protein_matrix.upper(), self.PROTEIN_MATRIX))
439
440 - def set_dna_matrix(self, dna_matrix):
441 """Set the type of DNA matrix to use. 442 443 The dna_matrix can either be one of the defined types (iub or clustalw) 444 or a file with the matrix to use.""" 445 if dna_matrix.upper() in self.DNA_MATRIX: 446 self.dna_matrix = dna_matrix.upper() 447 elif os.path.exists(dna_matrix): 448 self.dna_matrix = dna_matrix 449 else: 450 raise ValueError("Invalid matrix %s. Options are %s or a file." % 451 (dna_matrix, self.DNA_MATRIX))
452
453 - def set_type(self, residue_type):
454 """Set the type of residues within the file. 455 456 Clustal tries to guess whether the info is protein or DNA based on 457 the number of GATCs, but this can be wrong if you have a messed up 458 protein or DNA you are working with, so this allows you to set it 459 explicitly. 460 """ 461 residue_type = residue_type.upper() 462 if residue_type in self.RESIDUE_TYPES: 463 self.type = residue_type 464 else: 465 raise ValueError("Invalid residue type %s. Valid choices are %s" 466 % (residue_type, self.RESIDUE_TYPES))
467