Package Bio :: Module pairwise2
[hide private]
[frames] | no frames]

Source Code for Module Bio.pairwise2

  1  # Copyright 2002 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """This package implements pairwise sequence alignment using a dynamic 
  7  programming algorithm. 
  8   
  9  This provides functions to get global and local alignments between two 
 10  sequences.  A global alignment finds the best concordance between all 
 11  characters in two sequences.  A local alignment finds just the 
 12  subsequences that align the best. 
 13   
 14  When doing alignments, you can specify the match score and gap 
 15  penalties.  The match score indicates the compatibility between an 
 16  alignment of two characters in the sequences.  Highly compatible 
 17  characters should be given positive scores, and incompatible ones 
 18  should be given negative scores or 0.  The gap penalties should be 
 19  negative. 
 20   
 21  The names of the alignment functions in this module follow the 
 22  convention 
 23  <alignment type>XX 
 24  where <alignment type> is either "global" or "local" and XX is a 2 
 25  character code indicating the parameters it takes.  The first 
 26  character indicates the parameters for matches (and mismatches), and 
 27  the second indicates the parameters for gap penalties. 
 28   
 29  The match parameters are 
 30  CODE  DESCRIPTION 
 31  x     No parameters.  Identical characters have score of 1, otherwise 0. 
 32  m     A match score is the score of identical chars, otherwise mismatch score. 
 33  d     A dictionary returns the score of any pair of characters. 
 34  c     A callback function returns scores. 
 35   
 36  The gap penalty parameters are 
 37  CODE  DESCRIPTION 
 38  x     No gap penalties. 
 39  s     Same open and extend gap penalties for both sequences. 
 40  d     The sequences have different open and extend gap penalties. 
 41  c     A callback function returns the gap penalties. 
 42   
 43  All the different alignment functions are contained in an object 
 44  "align".  For example: 
 45   
 46      >>> from Bio import pairwise2 
 47      >>> alignments = pairwise2.align.globalxx("ACCGT", "ACG") 
 48   
 49  will return a list of the alignments between the two strings.  The 
 50  parameters of the alignment function depends on the function called. 
 51  Some examples: 
 52   
 53  >>> pairwise2.align.globalxx("ACCGT", "ACG") 
 54      # Find the best global alignment between the two sequences. 
 55      # Identical characters are given 1 point.  No points are deducted 
 56      # for mismatches or gaps. 
 57       
 58  >>> pairwise2.align.localxx("ACCGT", "ACG") 
 59      # Same thing as before, but with a local alignment. 
 60       
 61  >>> pairwise2.align.globalmx("ACCGT", "ACG", 2, -1) 
 62      # Do a global alignment.  Identical characters are given 2 points, 
 63      # 1 point is deducted for each non-identical character. 
 64   
 65  >>> pairwise2.align.globalms("ACCGT", "ACG", 2, -1, -.5, -.1) 
 66      # Same as above, except now 0.5 points are deducted when opening a 
 67      # gap, and 0.1 points are deducted when extending it. 
 68   
 69   
 70  To see a description of the parameters for a function, please look at 
 71  the docstring for the function. 
 72   
 73  >>> print newalign.align.localds.__doc__ 
 74  localds(sequenceA, sequenceB, match_dict, open, extend) -> alignments 
 75   
 76  """ 
 77  # The alignment functions take some undocumented keyword parameters: 
 78  # - penalize_extend_when_opening: boolean 
 79  #   Whether to count an extension penalty when opening a gap.  If 
 80  #   false, a gap of 1 is only penalize an "open" penalty, otherwise it 
 81  #   is penalized "open+extend". 
 82  # - penalize_end_gaps: boolean 
 83  #   Whether to count the gaps at the ends of an alignment.  By 
 84  #   default, they are counted for global alignments but not for local 
 85  #   ones. 
 86  # - gap_char: string 
 87  #   Which character to use as a gap character in the alignment 
 88  #   returned.  By default, uses '-'. 
 89  # - force_generic: boolean 
 90  #   Always use the generic, non-cached, dynamic programming function. 
 91  #   For debugging. 
 92  # - score_only: boolean 
 93  #   Only get the best score, don't recover any alignments.  The return 
 94  #   value of the function is the score. 
 95  # - one_alignment_only: boolean 
 96  #   Only recover one alignment. 
 97   
 98  from types import * 
 99   
100  from Bio import listfns 
101   
102  MAX_ALIGNMENTS = 1000   # maximum alignments recovered in traceback 
103   
104 -class align:
105 """This class provides functions that do alignments.""" 106
107 - class alignment_function:
108 """This class is callable impersonates an alignment function. 109 The constructor takes the name of the function. This class 110 will decode the name of the function to figure out how to 111 interpret the parameters. 112 113 """ 114 # match code -> tuple of (parameters, docstring) 115 match2args = { 116 'x' : ([], ''), 117 'm' : (['match', 'mismatch'], 118 """match is the score to given to identical characters. mismatch is 119 the score given to non-identical ones."""), 120 'd' : (['match_dict'], 121 """match_dict is a dictionary where the keys are tuples of pairs of 122 characters and the values are the scores, e.g. ("A", "C") : 2.5."""), 123 'c' : (['match_fn'], 124 """match_fn is a callback function that takes two characters and 125 returns the score between them."""), 126 } 127 # penalty code -> tuple of (parameters, docstring) 128 penalty2args = { 129 'x' : ([], ''), 130 's' : (['open', 'extend'], 131 """open and extend are the gap penalties when a gap is opened and 132 extended. They should be negative."""), 133 'd' : (['openA', 'extendA', 'openB', 'extendB'], 134 """openA and extendA are the gap penalties for sequenceA, and openB 135 and extendB for sequeneB. The penalties should be negative."""), 136 'c' : (['gap_A_fn', 'gap_B_fn'], 137 """gap_A_fn and gap_B_fn are callback functions that takes 1) the 138 index where the gap is opened, and 2) the length of the gap. They 139 should return a gap penalty."""), 140 } 141
142 - def __init__(self, name):
143 # Check to make sure the name of the function is 144 # reasonable. 145 if name.startswith("global"): 146 if len(name) != 8: 147 raise AttributeError("function should be globalXX") 148 elif name.startswith("local"): 149 if len(name) != 7: 150 raise AttributeError("function should be localXX") 151 else: 152 raise AttributeError(name) 153 align_type, match_type, penalty_type = \ 154 name[:-2], name[-2], name[-1] 155 try: 156 match_args, match_doc = self.match2args[match_type] 157 except KeyError, x: 158 raise AttributeError("unknown match type %r" % match_type) 159 try: 160 penalty_args, penalty_doc = self.penalty2args[penalty_type] 161 except KeyError, x: 162 raise AttributeError("unknown penalty type %r" % penalty_type) 163 164 # Now get the names of the parameters to this function. 165 param_names = ['sequenceA', 'sequenceB'] 166 param_names.extend(match_args) 167 param_names.extend(penalty_args) 168 self.function_name = name 169 self.align_type = align_type 170 self.param_names = param_names 171 172 self.__name__ = self.function_name 173 # Set the doc string. 174 doc = "%s(%s) -> alignments\n" % ( 175 self.__name__, ', '.join(self.param_names)) 176 if match_doc: 177 doc += "\n%s\n" % match_doc 178 if penalty_doc: 179 doc += "\n%s\n" % penalty_doc 180 doc += ( 181 """\nalignments is a list of tuples (seqA, seqB, score, begin, end). 182 seqA and seqB are strings showing the alignment between the 183 sequences. score is the score of the alignment. begin and end 184 are indexes into seqA and seqB that indicate the where the 185 alignment occurs. 186 """) 187 self.__doc__ = doc
188
189 - def decode(self, *args, **keywds):
190 # Decode the arguments for the _align function. keywds 191 # will get passed to it, so translate the arguments to 192 # this function into forms appropriate for _align. 193 keywds = keywds.copy() 194 if len(args) != len(self.param_names): 195 raise TypeError("%s takes exactly %d argument (%d given)" \ 196 % (self.function_name, len(self.param_names), len(args))) 197 i = 0 198 while i < len(self.param_names): 199 if self.param_names[i] in [ 200 'sequenceA', 'sequenceB', 201 'gap_A_fn', 'gap_B_fn', 'match_fn']: 202 keywds[self.param_names[i]] = args[i] 203 i += 1 204 elif self.param_names[i] == 'match': 205 assert self.param_names[i+1] == 'mismatch' 206 match, mismatch = args[i], args[i+1] 207 keywds['match_fn'] = identity_match(match, mismatch) 208 i += 2 209 elif self.param_names[i] == 'match_dict': 210 keywds['match_fn'] = dictionary_match(args[i]) 211 i += 1 212 elif self.param_names[i] == 'open': 213 assert self.param_names[i+1] == 'extend' 214 open, extend = args[i], args[i+1] 215 pe = keywds.get('penalize_extend_when_opening', 0) 216 keywds['gap_A_fn'] = affine_penalty(open, extend, pe) 217 keywds['gap_B_fn'] = affine_penalty(open, extend, pe) 218 i += 2 219 elif self.param_names[i] == 'openA': 220 assert self.param_names[i+3] == 'extendB' 221 openA, extendA, openB, extendB = args[i:i+4] 222 pe = keywds.get('penalize_extend_when_opening', 0) 223 keywds['gap_A_fn'] = affine_penalty(openA, extendA, pe) 224 keywds['gap_B_fn'] = affine_penalty(openB, extendB, pe) 225 i += 4 226 else: 227 raise ValueError("unknown parameter %r" \ 228 % self.param_names[i]) 229 230 # Here are the default parameters for _align. Assign 231 # these to keywds, unless already specified. 232 pe = keywds.get('penalize_extend_when_opening', 0) 233 default_params = [ 234 ('match_fn', identity_match(1, 0)), 235 ('gap_A_fn', affine_penalty(0, 0, pe)), 236 ('gap_B_fn', affine_penalty(0, 0, pe)), 237 ('penalize_extend_when_opening', 0), 238 ('penalize_end_gaps', self.align_type == 'global'), 239 ('align_globally', self.align_type == 'global'), 240 ('gap_char', '-'), 241 ('force_generic', 0), 242 ('score_only', 0), 243 ('one_alignment_only', 0) 244 ] 245 for name, default in default_params: 246 keywds[name] = keywds.get(name, default) 247 return keywds
248
249 - def __call__(self, *args, **keywds):
250 keywds = self.decode(*args, **keywds) 251 return _align(**keywds)
252
253 - def __getattr__(self, attr):
254 return self.alignment_function(attr)
255 align = align() 256 257
258 -def _align(sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn, 259 penalize_extend_when_opening, penalize_end_gaps, 260 align_globally, gap_char, force_generic, score_only, 261 one_alignment_only):
262 if not sequenceA or not sequenceB: 263 return [] 264 265 if (not force_generic) and \ 266 type(gap_A_fn) is InstanceType and \ 267 gap_A_fn.__class__ is affine_penalty and \ 268 type(gap_B_fn) is InstanceType and \ 269 gap_B_fn.__class__ is affine_penalty: 270 open_A, extend_A = gap_A_fn.open, gap_A_fn.extend 271 open_B, extend_B = gap_B_fn.open, gap_B_fn.extend 272 x = _make_score_matrix_fast( 273 sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B, 274 penalize_extend_when_opening, penalize_end_gaps, align_globally, 275 score_only) 276 else: 277 x = _make_score_matrix_generic( 278 sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn, 279 penalize_extend_when_opening, penalize_end_gaps, align_globally, 280 score_only) 281 score_matrix, trace_matrix = x 282 283 #print "SCORE"; print_matrix(score_matrix) 284 #print "TRACEBACK"; print_matrix(trace_matrix) 285 286 # Look for the proper starting point. Get a list of all possible 287 # starting points. 288 starts = _find_start( 289 score_matrix, sequenceA, sequenceB, 290 gap_A_fn, gap_B_fn, penalize_end_gaps, align_globally) 291 # Find the highest score. 292 best_score = max([x[0] for x in starts]) 293 294 # If they only want the score, then return it. 295 if score_only: 296 return best_score 297 298 tolerance = 0 # XXX do anything with this? 299 # Now find all the positions within some tolerance of the best 300 # score. 301 i = 0 302 while i < len(starts): 303 score, pos = starts[i] 304 if rint(abs(score-best_score)) > rint(tolerance): 305 del starts[i] 306 else: 307 i += 1 308 309 # Recover the alignments and return them. 310 x = _recover_alignments( 311 sequenceA, sequenceB, starts, score_matrix, trace_matrix, 312 align_globally, penalize_end_gaps, gap_char, one_alignment_only) 313 return x
314
315 -def _make_score_matrix_generic( 316 sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn, 317 penalize_extend_when_opening, penalize_end_gaps, align_globally, 318 score_only):
319 # This is an implementation of the Needleman-Wunsch dynamic 320 # programming algorithm for aligning sequences. 321 322 # Create the score and traceback matrices. These should be in the 323 # shape: 324 # sequenceA (down) x sequenceB (across) 325 lenA, lenB = len(sequenceA), len(sequenceB) 326 score_matrix, trace_matrix = [], [] 327 for i in range(lenA): 328 score_matrix.append([None] * lenB) 329 trace_matrix.append([[None]] * lenB) 330 331 # The top and left borders of the matrices are special cases 332 # because there are no previously aligned characters. To simplify 333 # the main loop, handle these separately. 334 for i in range(lenA): 335 # Align the first residue in sequenceB to the ith residue in 336 # sequence A. This is like opening up i gaps at the beginning 337 # of sequence B. 338 score = match_fn(sequenceA[i], sequenceB[0]) 339 if penalize_end_gaps: 340 score += gap_B_fn(0, i) 341 score_matrix[i][0] = score 342 for i in range(1, lenB): 343 score = match_fn(sequenceA[0], sequenceB[i]) 344 if penalize_end_gaps: 345 score += gap_A_fn(0, i) 346 score_matrix[0][i] = score 347 348 # Fill in the score matrix. Each position in the matrix 349 # represents an alignment between a character from sequenceA to 350 # one in sequence B. As I iterate through the matrix, find the 351 # alignment by choose the best of: 352 # 1) extending a previous alignment without gaps 353 # 2) adding a gap in sequenceA 354 # 3) adding a gap in sequenceB 355 for row in range(1, lenA): 356 for col in range(1, lenB): 357 # First, calculate the score that would occur by extending 358 # the alignment without gaps. 359 best_score = score_matrix[row-1][col-1] 360 best_score_rint = rint(best_score) 361 best_indexes = [(row-1, col-1)] 362 363 # Try to find a better score by opening gaps in sequenceA. 364 # Do this by checking alignments from each column in the 365 # previous row. Each column represents a different 366 # character to align from, and thus a different length 367 # gap. 368 for i in range(0, col-1): 369 score = score_matrix[row-1][i] + gap_A_fn(i, col-1-i) 370 score_rint = rint(score) 371 if score_rint == best_score_rint: 372 best_score, best_score_rint = score, score_rint 373 best_indexes.append((row-1, i)) 374 elif score_rint > best_score_rint: 375 best_score, best_score_rint = score, score_rint 376 best_indexes = [(row-1, i)] 377 378 # Try to find a better score by opening gaps in sequenceB. 379 for i in range(0, row-1): 380 score = score_matrix[i][col-1] + gap_B_fn(i, row-1-i) 381 score_rint = rint(score) 382 if score_rint == best_score_rint: 383 best_score, best_score_rint = score, score_rint 384 best_indexes.append((i, col-1)) 385 elif score_rint > best_score_rint: 386 best_score, best_score_rint = score, score_rint 387 best_indexes = [(i, col-1)] 388 389 score_matrix[row][col] = best_score + \ 390 match_fn(sequenceA[row], sequenceB[col]) 391 if not align_globally and score_matrix[row][col] < 0: 392 score_matrix[row][col] = 0 393 trace_matrix[row][col] = best_indexes 394 return score_matrix, trace_matrix
395
396 -def _make_score_matrix_fast( 397 sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B, 398 penalize_extend_when_opening, penalize_end_gaps, 399 align_globally, score_only):
400 first_A_gap = calc_affine_penalty(1, open_A, extend_A, 401 penalize_extend_when_opening) 402 first_B_gap = calc_affine_penalty(1, open_B, extend_B, 403 penalize_extend_when_opening) 404 405 # Create the score and traceback matrices. These should be in the 406 # shape: 407 # sequenceA (down) x sequenceB (across) 408 lenA, lenB = len(sequenceA), len(sequenceB) 409 score_matrix, trace_matrix = [], [] 410 for i in range(lenA): 411 score_matrix.append([None] * lenB) 412 trace_matrix.append([[None]] * lenB) 413 414 # The top and left borders of the matrices are special cases 415 # because there are no previously aligned characters. To simplify 416 # the main loop, handle these separately. 417 for i in range(lenA): 418 # Align the first residue in sequenceB to the ith residue in 419 # sequence A. This is like opening up i gaps at the beginning 420 # of sequence B. 421 score = match_fn(sequenceA[i], sequenceB[0]) 422 if penalize_end_gaps: 423 score += calc_affine_penalty( 424 i, open_B, extend_B, penalize_extend_when_opening) 425 score_matrix[i][0] = score 426 for i in range(1, lenB): 427 score = match_fn(sequenceA[0], sequenceB[i]) 428 if penalize_end_gaps: 429 score += calc_affine_penalty( 430 i, open_A, extend_A, penalize_extend_when_opening) 431 score_matrix[0][i] = score 432 433 # In the generic algorithm, at each row and column in the score 434 # matrix, we had to scan all previous rows and columns to see 435 # whether opening a gap might yield a higher score. Here, since 436 # we know the penalties are affine, we can cache just the best 437 # score in the previous rows and columns. Instead of scanning 438 # through all the previous rows and cols, we can just look at the 439 # cache for the best one. Whenever the row or col increments, the 440 # best cached score just decreases by extending the gap longer. 441 442 # The best score and indexes for each row (goes down all columns). 443 # I don't need to store the last row because it's the end of the 444 # sequence. 445 row_cache_score, row_cache_index = [None]*(lenA-1), [None]*(lenA-1) 446 # The best score and indexes for each column (goes across rows). 447 col_cache_score, col_cache_index = [None]*(lenB-1), [None]*(lenB-1) 448 449 for i in range(lenA-1): 450 # Initialize each row to be the alignment of sequenceA[i] to 451 # sequenceB[0], plus opening a gap in sequenceA. 452 row_cache_score[i] = score_matrix[i][0] + first_A_gap 453 row_cache_index[i] = [(i, 0)] 454 for i in range(lenB-1): 455 col_cache_score[i] = score_matrix[0][i] + first_B_gap 456 col_cache_index[i] = [(0, i)] 457 458 # Fill in the score_matrix. 459 for row in range(1, lenA): 460 for col in range(1, lenB): 461 # Calculate the score that would occur by extending the 462 # alignment without gaps. 463 nogap_score = score_matrix[row-1][col-1] 464 465 # Check the score that would occur if there were a gap in 466 # sequence A. 467 if col > 1: 468 row_score = row_cache_score[row-1] 469 else: 470 row_score = nogap_score - 1 # Make sure it's not the best. 471 # Check the score that would occur if there were a gap in 472 # sequence B. 473 if row > 1: 474 col_score = col_cache_score[col-1] 475 else: 476 col_score = nogap_score - 1 477 478 best_score = max(nogap_score, row_score, col_score) 479 best_score_rint = rint(best_score) 480 best_index = [] 481 if best_score_rint == rint(nogap_score): 482 best_index.append((row-1, col-1)) 483 if best_score_rint == rint(row_score): 484 best_index.extend(row_cache_index[row-1]) 485 if best_score_rint == rint(col_score): 486 best_index.extend(col_cache_index[col-1]) 487 488 # Set the score and traceback matrices. 489 score = best_score + match_fn(sequenceA[row], sequenceB[col]) 490 if not align_globally and score < 0: 491 score_matrix[row][col] = 0 492 else: 493 score_matrix[row][col] = score 494 trace_matrix[row][col] = best_index 495 496 # Update the cached column scores. The best score for 497 # this can come from either extending the gap in the 498 # previous cached score, or opening a new gap from the 499 # most previously seen character. Compare the two scores 500 # and keep the best one. 501 open_score = score_matrix[row-1][col-1] + first_B_gap 502 extend_score = col_cache_score[col-1] + extend_B 503 open_score_rint, extend_score_rint = \ 504 rint(open_score), rint(extend_score) 505 if open_score_rint > extend_score_rint: 506 col_cache_score[col-1] = open_score 507 col_cache_index[col-1] = [(row-1, col-1)] 508 elif extend_score_rint > open_score_rint: 509 col_cache_score[col-1] = extend_score 510 else: 511 col_cache_score[col-1] = open_score 512 if (row-1, col-1) not in col_cache_index[col-1]: 513 col_cache_index[col-1] = col_cache_index[col-1] + \ 514 [(row-1, col-1)] 515 516 # Update the cached row scores. 517 open_score = score_matrix[row-1][col-1] + first_A_gap 518 extend_score = row_cache_score[row-1] + extend_A 519 open_score_rint, extend_score_rint = \ 520 rint(open_score), rint(extend_score) 521 if open_score_rint > extend_score_rint: 522 row_cache_score[row-1] = open_score 523 row_cache_index[row-1] = [(row-1, col-1)] 524 elif extend_score_rint > open_score_rint: 525 row_cache_score[row-1] = extend_score 526 else: 527 row_cache_score[row-1] = open_score 528 if (row-1, col-1) not in row_cache_index[row-1]: 529 row_cache_index[row-1] = row_cache_index[row-1] + \ 530 [(row-1, col-1)] 531 532 return score_matrix, trace_matrix
533
534 -def _recover_alignments(sequenceA, sequenceB, starts, 535 score_matrix, trace_matrix, align_globally, 536 penalize_end_gaps, gap_char, one_alignment_only):
537 # Recover the alignments by following the traceback matrix. This 538 # is a recursive procedure, but it's implemented here iteratively 539 # with a stack. 540 lenA, lenB = len(sequenceA), len(sequenceB) 541 tracebacks = [] # list of (seq1, seq2, score, begin, end) 542 in_process = [] # list of ([same as tracebacks], prev_pos, next_pos) 543 544 # sequenceA and sequenceB may be sequences, including strings, 545 # lists, or list-like objects. In order to preserve the type of 546 # the object, we need to use slices on the sequences instead of 547 # indexes. For example, sequenceA[row] may return a type that's 548 # not compatible with sequenceA, e.g. if sequenceA is a list and 549 # sequenceA[row] is a string. Thus, avoid using indexes and use 550 # slices, e.g. sequenceA[row:row+1]. Assume that client-defined 551 # sequence classes preserve these semantics. 552 553 # Initialize the in_process stack 554 for score, (row, col) in starts: 555 if align_globally: 556 begin, end = None, None 557 else: 558 begin, end = None, -max(lenA-row, lenB-col)+1 559 if not end: 560 end = None 561 # Initialize the in_process list with empty sequences of the 562 # same type as sequenceA. To do this, take empty slices of 563 # the sequences. 564 in_process.append( 565 (sequenceA[0:0], sequenceB[0:0], score, begin, end, 566 (lenA, lenB), (row, col))) 567 if one_alignment_only: 568 break 569 while in_process and len(tracebacks) < MAX_ALIGNMENTS: 570 seqA, seqB, score, begin, end, prev_pos, next_pos = in_process.pop() 571 prevA, prevB = prev_pos 572 if next_pos is None: 573 prevlen = len(seqA) 574 # add the rest of the sequences 575 seqA = sequenceA[:prevA] + seqA 576 seqB = sequenceB[:prevB] + seqB 577 # add the rest of the gaps 578 seqA, seqB = _lpad_until_equal(seqA, seqB, gap_char) 579 580 # Now make sure begin is set. 581 if begin is None: 582 if align_globally: 583 begin = 0 584 else: 585 begin = len(seqA) - prevlen 586 tracebacks.append((seqA, seqB, score, begin, end)) 587 else: 588 nextA, nextB = next_pos 589 nseqA, nseqB = prevA-nextA, prevB-nextB 590 maxseq = max(nseqA, nseqB) 591 ngapA, ngapB = maxseq-nseqA, maxseq-nseqB 592 seqA = sequenceA[nextA:nextA+nseqA] + gap_char*ngapA + seqA 593 seqB = sequenceB[nextB:nextB+nseqB] + gap_char*ngapB + seqB 594 prev_pos = next_pos 595 # local alignment stops early if score falls < 0 596 if not align_globally and score_matrix[nextA][nextB] <= 0: 597 begin = max(prevA, prevB) 598 in_process.append( 599 (seqA, seqB, score, begin, end, prev_pos, None)) 600 else: 601 for next_pos in trace_matrix[nextA][nextB]: 602 in_process.append( 603 (seqA, seqB, score, begin, end, prev_pos, next_pos)) 604 if one_alignment_only: 605 break 606 607 return _clean_alignments(tracebacks)
608
609 -def _find_start(score_matrix, sequenceA, sequenceB, gap_A_fn, gap_B_fn, 610 penalize_end_gaps, align_globally):
611 # Return a list of (score, (row, col)) indicating every possible 612 # place to start the tracebacks. 613 if align_globally: 614 if penalize_end_gaps: 615 starts = _find_global_start( 616 sequenceA, sequenceB, score_matrix, gap_A_fn, gap_B_fn, 1) 617 else: 618 starts = _find_global_start( 619 sequenceA, sequenceB, score_matrix, None, None, 0) 620 else: 621 starts = _find_local_start(score_matrix) 622 return starts
623
624 -def _find_global_start(sequenceA, sequenceB, 625 score_matrix, gap_A_fn, gap_B_fn, penalize_end_gaps):
626 # The whole sequence should be aligned, so return the positions at 627 # the end of either one of the sequences. 628 nrows, ncols = len(score_matrix), len(score_matrix[0]) 629 positions = [] 630 # Search all rows in the last column. 631 for row in range(nrows): 632 # Find the score, penalizing end gaps if necessary. 633 score = score_matrix[row][ncols-1] 634 if penalize_end_gaps: 635 score += gap_B_fn(ncols, nrows-row-1) 636 positions.append((score, (row, ncols-1))) 637 # Search all columns in the last row. 638 for col in range(ncols-1): 639 score = score_matrix[nrows-1][col] 640 if penalize_end_gaps: 641 score += gap_A_fn(nrows, ncols-col-1) 642 positions.append((score, (nrows-1, col))) 643 return positions
644
645 -def _find_local_start(score_matrix):
646 # Return every position in the matrix. 647 positions = [] 648 nrows, ncols = len(score_matrix), len(score_matrix[0]) 649 for row in range(nrows): 650 for col in range(ncols): 651 score = score_matrix[row][col] 652 positions.append((score, (row, col))) 653 return positions
654
655 -def _clean_alignments(alignments):
656 # Take a list of alignments and return a cleaned version. Remove 657 # duplicates, make sure begin and end are set correctly, remove 658 # empty alignments. 659 alignments = listfns.items(alignments) # Get rid of duplicates 660 i = 0 661 while i < len(alignments): 662 seqA, seqB, score, begin, end = alignments[i] 663 # Make sure end is set reasonably. 664 if end is None: # global alignment 665 end = len(seqA) 666 elif end < 0: 667 end = end + len(seqA) 668 # If there's no alignment here, get rid of it. 669 if begin >= end: 670 del alignments[i] 671 continue 672 alignments[i] = seqA, seqB, score, begin, end 673 i += 1 674 return alignments
675
676 -def _pad_until_equal(s1, s2, char):
677 # Add char to the end of s1 or s2 until they are equal length. 678 ls1, ls2 = len(s1), len(s2) 679 if ls1 < ls2: 680 s1 = _pad(s1, char, ls2-ls1) 681 elif ls2 < ls1: 682 s2 = _pad(s2, char, ls1-ls2) 683 return s1, s2
684
685 -def _lpad_until_equal(s1, s2, char):
686 # Add char to the beginning of s1 or s2 until they are equal 687 # length. 688 ls1, ls2 = len(s1), len(s2) 689 if ls1 < ls2: 690 s1 = _lpad(s1, char, ls2-ls1) 691 elif ls2 < ls1: 692 s2 = _lpad(s2, char, ls1-ls2) 693 return s1, s2
694
695 -def _pad(s, char, n):
696 # Append n chars to the end of s. 697 return s + char*n
698
699 -def _lpad(s, char, n):
700 # Prepend n chars to the beginning of s. 701 return char*n + s
702 703 _PRECISION = 1000
704 -def rint(x, precision=_PRECISION):
705 return int(x * precision + 0.5)
706
707 -class identity_match:
708 """identity_match([match][, mismatch]) -> match_fn 709 710 Create a match function for use in an alignment. match and 711 mismatch are the scores to give when two residues are equal or 712 unequal. By default, match is 1 and mismatch is 0. 713 714 """
715 - def __init__(self, match=1, mismatch=0):
716 self.match = match 717 self.mismatch = mismatch
718 - def __call__(self, charA, charB):
719 if charA == charB: 720 return self.match 721 return self.mismatch
722
723 -class dictionary_match:
724 """dictionary_match(score_dict[, symmetric]) -> match_fn 725 726 Create a match function for use in an alignment. score_dict is a 727 dictionary where the keys are tuples (residue 1, residue 2) and 728 the values are the match scores between those residues. symmetric 729 is a flag that indicates whether the scores are symmetric. If 730 true, then if (res 1, res 2) doesn't exist, I will use the score 731 at (res 2, res 1). 732 733 """
734 - def __init__(self, score_dict, symmetric=1):
735 self.score_dict = score_dict 736 self.symmetric = symmetric
737 - def __call__(self, charA, charB):
738 if self.symmetric and (charA, charB) not in self.score_dict: 739 # If the score dictionary is symmetric, then look up the 740 # score both ways. 741 charB, charA = charA, charB 742 return self.score_dict[(charA, charB)]
743
744 -class affine_penalty:
745 """affine_penalty(open, extend[, penalize_extend_when_opening]) -> gap_fn 746 747 Create a gap function for use in an alignment. 748 749 """
750 - def __init__(self, open, extend, penalize_extend_when_opening=0):
751 if open > 0 or extend > 0: 752 raise ValueError("Gap penalties should be non-positive.") 753 self.open, self.extend = open, extend 754 self.penalize_extend_when_opening = penalize_extend_when_opening
755 - def __call__(self, index, length):
756 return calc_affine_penalty( 757 length, self.open, self.extend, self.penalize_extend_when_opening)
758
759 -def calc_affine_penalty(length, open, extend, penalize_extend_when_opening):
760 if length <= 0: 761 return 0 762 penalty = open + extend * length 763 if not penalize_extend_when_opening: 764 penalty -= extend 765 return penalty
766 784
785 -def format_alignment(align1, align2, score, begin, end):
786 """format_alignment(align1, align2, score, begin, end) -> string 787 788 Format the alignment prettily into a string. 789 790 """ 791 s = [] 792 s.append("%s\n" % align1) 793 s.append("%s%s\n" % (" "*begin, "|"*(end-begin))) 794 s.append("%s\n" % align2) 795 s.append(" Score=%g\n" % score) 796 return ''.join(s)
797 798 799 # Try and load C implementations of functions. If I can't, 800 # then just ignore and use the pure python implementations. 801 try: 802 import cpairwise2 803 except ImportError: 804 pass 805 else: 806 import sys 807 this_module = sys.modules[__name__] 808 for name in cpairwise2.__dict__.keys(): 809 if not name.startswith("__"): 810 this_module.__dict__[name] = cpairwise2.__dict__[name] 811