Package Bio :: Package Align :: Module AlignInfo
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.AlignInfo

  1  """Extract information from alignment objects. 
  2   
  3  In order to try and avoid huge alignment objects with tons of functions, 
  4  functions which return summary type information about alignments should 
  5  be put into classes in this module. 
  6   
  7  classes: 
  8  o SummaryInfo 
  9  o PSSM 
 10  """ 
 11   
 12  # standard library 
 13  import math 
 14  import sys 
 15   
 16  #TODO - Remove this work around once we drop python 2.3 support 
 17  try: 
 18     set = set 
 19  except NameError: 
 20     from sets import Set as set 
 21   
 22  # biopython modules 
 23  from Bio import Alphabet 
 24  from Bio.Alphabet import IUPAC 
 25  from Bio.Seq import Seq 
 26  from Bio.SubsMat import FreqTable 
 27   
 28  # Expected random distributions for 20-letter protein, and 
 29  # for 4-letter nucleotide alphabets 
 30  Protein20Random = 0.05 
 31  Nucleotide4Random = 0.25 
32 -class SummaryInfo:
33 """Calculate summary info about the alignment. 34 35 This class should be used to caclculate information summarizing the 36 results of an alignment. This may either be straight consensus info 37 or more complicated things. 38 """
39 - def __init__(self, alignment):
40 """Initialize with the alignment to calculate information on. 41 ic_vector attribute. A dictionary. Keys: column numbers. Values: 42 """ 43 self.alignment = alignment 44 self.ic_vector = {}
45
46 - def dumb_consensus(self, threshold = .7, ambiguous = "X", 47 consensus_alpha = None, require_multiple = 0):
48 """Output a fast consensus sequence of the alignment. 49 50 This doesn't do anything fancy at all. It will just go through the 51 sequence residue by residue and count up the number of each type 52 of residue (ie. A or G or T or C for DNA) in all sequences in the 53 alignment. If the percentage of the most common residue type is 54 greater then the passed threshold, then we will add that residue type, 55 otherwise an ambiguous character will be added. 56 57 This could be made a lot fancier (ie. to take a substitution matrix 58 into account), but it just meant for a quick and dirty consensus. 59 60 Arguments: 61 o threshold - The threshold value that is required to add a particular 62 atom. 63 o ambiguous - The ambiguous character to be added when the threshold is 64 not reached. 65 o consensus_alpha - The alphabet to return for the consensus sequence. 66 If this is None, then we will try to guess the alphabet. 67 o require_multiple - If set as 1, this will require that more than 68 1 sequence be part of an alignment to put it in the consensus (ie. 69 not just 1 sequence and gaps). 70 """ 71 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 72 consensus = '' 73 74 # find the length of the consensus we are creating 75 con_len = self.alignment.get_alignment_length() 76 77 # go through each seq item 78 for n in range(con_len): 79 # keep track of the counts of the different atoms we get 80 atom_dict = {} 81 num_atoms = 0 82 83 for record in self.alignment._records: 84 # make sure we haven't run past the end of any sequences 85 # if they are of different lengths 86 if n < len(record.seq): 87 if record.seq[n] != '-' and record.seq[n] != '.': 88 if record.seq[n] not in atom_dict.keys(): 89 atom_dict[record.seq[n]] = 1 90 else: 91 atom_dict[record.seq[n]] = \ 92 atom_dict[record.seq[n]] + 1 93 94 num_atoms = num_atoms + 1 95 96 max_atoms = [] 97 max_size = 0 98 99 for atom in atom_dict.keys(): 100 if atom_dict[atom] > max_size: 101 max_atoms = [atom] 102 max_size = atom_dict[atom] 103 elif atom_dict[atom] == max_size: 104 max_atoms.append(atom) 105 106 if require_multiple and num_atoms == 1: 107 consensus = consensus + ambiguous 108 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms)) 109 >= threshold): 110 consensus = consensus + max_atoms[0] 111 else: 112 consensus = consensus + ambiguous 113 114 # we need to guess a consensus alphabet if one isn't specified 115 if consensus_alpha is None: 116 consensus_alpha = self._guess_consensus_alphabet(ambiguous) 117 118 return Seq(consensus, consensus_alpha)
119
120 - def gap_consensus(self, threshold = .7, ambiguous = "X", 121 consensus_alpha = None, require_multiple = 0):
122 """Same as dumb_consensus(), but allows gap on the output. 123 124 Things to do: Let the user define that with only one gap, the result 125 character in consensus is gap. Let the user select gap character, now 126 it takes the same is input. 127 """ 128 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 129 consensus = '' 130 131 # find the length of the consensus we are creating 132 con_len = self.alignment.get_alignment_length() 133 134 # go through each seq item 135 for n in range(con_len): 136 # keep track of the counts of the different atoms we get 137 atom_dict = {} 138 num_atoms = 0 139 140 for record in self.alignment._records: 141 # make sure we haven't run past the end of any sequences 142 # if they are of different lengths 143 if n < len(record.seq): 144 if record.seq[n] not in atom_dict.keys(): 145 atom_dict[record.seq[n]] = 1 146 else: 147 atom_dict[record.seq[n]] = \ 148 atom_dict[record.seq[n]] + 1 149 150 num_atoms = num_atoms + 1 151 152 max_atoms = [] 153 max_size = 0 154 155 for atom in atom_dict.keys(): 156 if atom_dict[atom] > max_size: 157 max_atoms = [atom] 158 max_size = atom_dict[atom] 159 elif atom_dict[atom] == max_size: 160 max_atoms.append(atom) 161 162 if require_multiple and num_atoms == 1: 163 consensus = consensus + ambiguous 164 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms)) 165 >= threshold): 166 consensus = consensus + max_atoms[0] 167 else: 168 consensus = consensus + ambiguous 169 170 # we need to guess a consensus alphabet if one isn't specified 171 if consensus_alpha is None: 172 #TODO - Should we make this into a Gapped alphabet? 173 consensus_alpha = self._guess_consensus_alphabet(ambiguous) 174 175 return Seq(consensus, consensus_alpha)
176
177 - def _guess_consensus_alphabet(self, ambiguous):
178 """Pick an (ungapped) alphabet for an alignment consesus sequence. 179 180 This just looks at the sequences we have, checks their type, and 181 returns as appropriate type which seems to make sense with the 182 sequences we've got. 183 """ 184 #Start with the (un-gapped version of) the alignment alphabet 185 a = Alphabet._get_base_alphabet(self.alignment._alphabet) 186 187 #Now check its compatible with all the rest of the sequences 188 for record in self.alignment : 189 #Get the (un-gapped version of) the sequence's alphabet 190 alt = Alphabet._get_base_alphabet(record.seq.alphabet) 191 if not isinstance(alt, a.__class__) : 192 raise ValueError \ 193 ("Alignment contains a sequence with an incompatible alphabet.") 194 195 #Check the ambiguous character we are going to use in the consensus 196 #is in the alphabet's list of valid letters (if defined). 197 if hasattr(a, "letters") and a.letters is not None \ 198 and ambiguous not in a.letters : 199 #We'll need to pick a more generic alphabet... 200 if isinstance(a, IUPAC.IUPACUnambiguousDNA) : 201 if ambiguous in IUPAC.IUPACUnambiguousDNA().letters : 202 a = IUPAC.IUPACUnambiguousDNA() 203 else : 204 a = Alphabet.generic_dna 205 elif isinstance(a, IUPAC.IUPACUnambiguousRNA) : 206 if ambiguous in IUPAC.IUPACUnambiguousRNA().letters : 207 a = IUPAC.IUPACUnambiguousRNA() 208 else : 209 a = Alphabet.generic_rna 210 elif isinstance(a, IUPAC.IUPACProtein) : 211 if ambiguous in IUPAC.ExtendedIUPACProtein().letters : 212 a = IUPAC.ExtendedIUPACProtein() 213 else : 214 a = Alphabet.generic_protein 215 else : 216 a = Alphabet.single_letter_alphabet 217 return a
218
219 - def replacement_dictionary(self, skip_chars = []):
220 """Generate a replacement dictionary to plug into a substitution matrix 221 222 This should look at an alignment, and be able to generate the number 223 of substitutions of different residues for each other in the 224 aligned object. 225 226 Will then return a dictionary with this information: 227 {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....} 228 229 This also treats weighted sequences. The following example shows how 230 we calculate the replacement dictionary. Given the following 231 multiple sequence alignments: 232 233 GTATC 0.5 234 AT--C 0.8 235 CTGTC 1.0 236 237 For the first column we have: 238 ('A', 'G') : 0.5 * 0.8 = 0.4 239 ('C', 'G') : 0.5 * 1.0 = 0.5 240 ('A', 'C') : 0.8 * 1.0 = 0.8 241 242 We then continue this for all of the columns in the alignment, summing 243 the information for each substitution in each column, until we end 244 up with the replacement dictionary. 245 246 Arguments: 247 o skip_chars - A list of characters to skip when creating the dictionary. 248 For instance, you might have Xs (screened stuff) or Ns, and not want 249 to include the ambiguity characters in the dictionary. 250 """ 251 # get a starting dictionary based on the alphabet of the alignment 252 rep_dict, skip_items = self._get_base_replacements(skip_chars) 253 254 # iterate through each record 255 for rec_num1 in range(len(self.alignment._records)): 256 # iterate through each record from one beyond the current record 257 # to the end of the list of records 258 for rec_num2 in range(rec_num1 + 1, len(self.alignment._records)): 259 # for each pair of records, compare the sequences and add 260 # the pertinent info to the dictionary 261 rep_dict = self._pair_replacement( 262 self.alignment._records[rec_num1].seq, 263 self.alignment._records[rec_num2].seq, 264 self.alignment._records[rec_num1].annotations.get('weight',1), 265 self.alignment._records[rec_num2].annotations.get('weight',1), 266 rep_dict, skip_items) 267 268 return rep_dict
269
270 - def _pair_replacement(self, seq1, seq2, weight1, weight2, 271 start_dict, ignore_chars):
272 """Compare two sequences and generate info on the replacements seen. 273 274 Arguments: 275 o seq1, seq2 - The two sequences to compare. 276 o weight1, weight2 - The relative weights of seq1 and seq2. 277 o start_dict - The dictionary containing the starting replacement 278 info that we will modify. 279 o ignore_chars - A list of characters to ignore when calculating 280 replacements (ie. '-'). 281 282 Returns: 283 o A replacment dictionary which is modified from initial_dict with 284 the information from the sequence comparison. 285 """ 286 # loop through each residue in the sequences 287 for residue_num in range(len(seq1)): 288 residue1 = seq1[residue_num] 289 try: 290 residue2 = seq2[residue_num] 291 # if seq2 is shorter, then we just stop looking at replacements 292 # and return the information 293 except IndexError: 294 return start_dict 295 296 # if the two residues are characters we want to count 297 if (residue1 not in ignore_chars) and (residue2 not in ignore_chars): 298 try: 299 # add info about the replacement to the dictionary, 300 # modified by the sequence weights 301 start_dict[(residue1, residue2)] = \ 302 start_dict[(residue1, residue2)] + \ 303 weight1 * weight2 304 305 # if we get a key error, then we've got a problem with alphabets 306 except KeyError: 307 raise ValueError("Residues %s, %s not found in alphabet %s" 308 % (residue1, residue2, 309 self.alignment._alphabet)) 310 311 return start_dict
312 313
314 - def _get_all_letters(self):
315 """Returns a string containing the expected letters in the alignment.""" 316 all_letters = self.alignment._alphabet.letters 317 if all_letters is None \ 318 or (isinstance(self.alignment._alphabet, Alphabet.Gapped) \ 319 and all_letters == self.alignment._alphabet.gap_char): 320 #We are dealing with a generic alphabet class where the 321 #letters are not defined! We must build a list of the 322 #letters used... 323 set_letters = set() 324 for record in self.alignment : 325 #Note the built in set does not have a union_update 326 #which was provided by the sets module's Set 327 set_letters = set_letters.union(record.seq) 328 list_letters = list(set_letters) 329 list_letters.sort() 330 all_letters = "".join(list_letters) 331 return all_letters
332
333 - def _get_base_replacements(self, skip_items = []):
334 """Get a zeroed dictonary of all possible letter combinations. 335 336 This looks at the type of alphabet and gets the letters for it. 337 It then creates a dictionary with all possible combinations of these 338 letters as keys (ie. ('A', 'G')) and sets the values as zero. 339 340 Returns: 341 o The base dictionary created 342 o A list of alphabet items to skip when filling the dictionary.Right 343 now the only thing I can imagine in this list is gap characters, but 344 maybe X's or something else might be useful later. This will also 345 include any characters that are specified to be skipped. 346 """ 347 base_dictionary = {} 348 all_letters = self._get_all_letters() 349 350 # if we have a gapped alphabet we need to find the gap character 351 # and drop it out 352 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 353 skip_items.append(self.alignment._alphabet.gap_char) 354 all_letters = all_letters.replace(self.alignment._alphabet.gap_char,'') 355 356 # now create the dictionary 357 for first_letter in all_letters: 358 for second_letter in all_letters: 359 if (first_letter not in skip_items and 360 second_letter not in skip_items): 361 base_dictionary[(first_letter, second_letter)] = 0 362 363 return base_dictionary, skip_items
364 365
366 - def pos_specific_score_matrix(self, axis_seq = None, 367 chars_to_ignore = []):
368 """Create a position specific score matrix object for the alignment. 369 370 This creates a position specific score matrix (pssm) which is an 371 alternative method to look at a consensus sequence. 372 373 Arguments: 374 o chars_to_ignore - A listing of all characters not to include in 375 the pssm. If the alignment alphabet declares a gap character, 376 then it will be excluded automatically. 377 o axis_seq - An optional argument specifying the sequence to 378 put on the axis of the PSSM. This should be a Seq object. If nothing 379 is specified, the consensus sequence, calculated with default 380 parameters, will be used. 381 382 Returns: 383 o A PSSM (position specific score matrix) object. 384 """ 385 # determine all of the letters we have to deal with 386 all_letters = self._get_all_letters() 387 assert all_letters 388 389 if not isinstance(chars_to_ignore, list) : 390 raise TypeError("chars_to_ignore should be a list.") 391 392 # if we have a gap char, add it to stuff to ignore 393 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 394 chars_to_ignore.append(self.alignment._alphabet.gap_char) 395 396 for char in chars_to_ignore: 397 all_letters = all_letters.replace(char, '') 398 399 if axis_seq: 400 left_seq = axis_seq 401 assert len(axis_seq) == self.alignment.get_alignment_length() 402 else: 403 left_seq = self.dumb_consensus() 404 405 pssm_info = [] 406 # now start looping through all of the sequences and getting info 407 for residue_num in range(len(left_seq)): 408 score_dict = self._get_base_letters(all_letters) 409 for record in self.alignment._records: 410 try: 411 this_residue = record.seq[residue_num] 412 # if we hit an index error we've run out of sequence and 413 # should not add new residues 414 except IndexError: 415 this_residue = None 416 417 if this_residue and this_residue not in chars_to_ignore: 418 weight = record.annotations.get('weight', 1) 419 try: 420 score_dict[this_residue] += weight 421 # if we get a KeyError then we have an alphabet problem 422 except KeyError: 423 raise ValueError("Residue %s not found in alphabet %s" 424 % (this_residue, 425 self.alignment._alphabet)) 426 427 pssm_info.append((left_seq[residue_num], 428 score_dict)) 429 430 431 return PSSM(pssm_info)
432
433 - def _get_base_letters(self, letters):
434 """Create a zeroed dictionary with all of the specified letters. 435 """ 436 base_info = {} 437 for letter in letters: 438 base_info[letter] = 0 439 440 return base_info
441
442 - def information_content(self, start = 0, 443 end = None, 444 e_freq_table = None, log_base = 2, 445 chars_to_ignore = []):
446 """Calculate the information content for each residue along an alignment. 447 448 Arguments: 449 o start, end - The starting an ending points to calculate the 450 information content. These points should be relative to the first 451 sequence in the alignment, starting at zero (ie. even if the 'real' 452 first position in the seq is 203 in the initial sequence, for 453 the info content, we need to use zero). This defaults to the entire 454 length of the first sequence. 455 o e_freq_table - A FreqTable object specifying the expected frequencies 456 for each letter in the alphabet we are using (e.g. {'G' : 0.4, 457 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be 458 included, since these should not have expected frequencies. 459 o log_base - The base of the logathrim to use in calculating the 460 information content. This defaults to 2 so the info is in bits. 461 o chars_to_ignore - A listing of characterw which should be ignored 462 in calculating the info content. 463 464 Returns: 465 o A number representing the info content for the specified region. 466 467 Please see the Biopython manual for more information on how information 468 content is calculated. 469 """ 470 # if no end was specified, then we default to the end of the sequence 471 if end is None: 472 end = len(self.alignment._records[0].seq) 473 474 if start < 0 or end > len(self.alignment._records[0].seq): 475 raise ValueError \ 476 ("Start (%s) and end (%s) are not in the range %s to %s" 477 % (start, end, 0, len(self.alignment._records[0].seq))) 478 # determine random expected frequencies, if necessary 479 random_expected = None 480 if not e_freq_table: 481 #TODO - What about ambiguous alphabets? 482 base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet) 483 if isinstance(base_alpha, Alphabet.ProteinAlphabet) : 484 random_expected = Protein20Random 485 elif isinstance(base_alpha, Alphabet.NucleotideAlphabet) : 486 random_expected = Nucleotide4Random 487 else : 488 errstr = "Error in alphabet: not Nucleotide or Protein, " 489 errstr += "supply expected frequencies" 490 raise ValueError(errstr) 491 del base_alpha 492 elif not isinstance(e_freq_table, FreqTable.FreqTable) : 493 raise ValueError("e_freq_table should be a FreqTable object") 494 495 496 # determine all of the letters we have to deal with 497 all_letters = self._get_all_letters() 498 for char in chars_to_ignore: 499 all_letters = all_letters.replace(char, '') 500 501 info_content = {} 502 for residue_num in range(start, end): 503 freq_dict = self._get_letter_freqs(residue_num, 504 self.alignment._records, 505 all_letters, chars_to_ignore) 506 # print freq_dict, 507 column_score = self._get_column_info_content(freq_dict, 508 e_freq_table, 509 log_base, 510 random_expected) 511 512 info_content[residue_num] = column_score 513 # sum up the score 514 total_info = 0 515 for column_info in info_content.values(): 516 total_info = total_info + column_info 517 # fill in the ic_vector member: holds IC for each column 518 for i in info_content.keys(): 519 self.ic_vector[i] = info_content[i] 520 return total_info
521
522 - def _get_letter_freqs(self, residue_num, all_records, letters, to_ignore):
523 """Determine the frequency of specific letters in the alignment. 524 525 Arguments: 526 o residue_num - The number of the column we are getting frequencies 527 from. 528 o all_records - All of the SeqRecords in the alignment. 529 o letters - The letters we are interested in getting the frequency 530 for. 531 o to_ignore - Letters we are specifically supposed to ignore. 532 533 This will calculate the frequencies of each of the specified letters 534 in the alignment at the given frequency, and return this as a 535 dictionary where the keys are the letters and the values are the 536 frequencies. 537 """ 538 freq_info = self._get_base_letters(letters) 539 540 total_count = 0 541 # collect the count info into the dictionary for all the records 542 for record in all_records: 543 try: 544 if record.seq[residue_num] not in to_ignore: 545 weight = record.annotations.get('weight',1) 546 freq_info[record.seq[residue_num]] += weight 547 total_count += weight 548 # getting a key error means we've got a problem with the alphabet 549 except KeyError: 550 raise ValueError("Residue %s not found in alphabet %s" 551 % (record.seq[residue_num], 552 self.alignment._alphabet)) 553 554 if total_count == 0 : 555 # This column must be entirely ignored characters 556 for letter in freq_info.keys(): 557 assert freq_info[letter] == 0 558 #TODO - Map this to NA or NaN? 559 else : 560 # now convert the counts into frequencies 561 for letter in freq_info.keys(): 562 freq_info[letter] = freq_info[letter] / total_count 563 564 return freq_info
565
566 - def _get_column_info_content(self, obs_freq, e_freq_table, log_base, 567 random_expected):
568 """Calculate the information content for a column. 569 570 Arguments: 571 o obs_freq - The frequencies observed for each letter in the column. 572 o e_freq_table - An optional argument specifying the expected 573 frequencies for each letter. This is a SubsMat.FreqTable instance. 574 o log_base - The base of the logathrim to use in calculating the 575 info content. 576 """ 577 try : 578 gap_char = self.alignment._alphabet.gap_char 579 except AttributeError : 580 #The alphabet doesn't declare a gap - there could be none 581 #in the sequence... or just a vague alphabet. 582 gap_char = "-" #Safe? 583 584 if e_freq_table: 585 if not isinstance(e_freq_table, FreqTable.FreqTable) : 586 raise ValueError("e_freq_table should be a FreqTable object") 587 # check the expected freq information to make sure it is good 588 for key in obs_freq.keys(): 589 if (key != gap_char and key not in e_freq_table): 590 raise ValueError("Expected frequency letters %s" + 591 " do not match observed %s" 592 % (e_freq_table.keys(), obs_freq.keys() - 593 [gap_char])) 594 595 total_info = 0.0 596 597 for letter in obs_freq.keys(): 598 inner_log = 0.0 599 # if we have expected frequencies, modify the log value by them 600 # gap characters do not have expected frequencies, so they 601 # should just be the observed frequency. 602 if letter != gap_char: 603 if e_freq_table: 604 inner_log = obs_freq[letter] / e_freq_table[letter] 605 else: 606 inner_log = obs_freq[letter] / random_expected 607 # if the observed frequency is zero, we don't add any info to the 608 # total information content 609 if inner_log > 0: 610 letter_info = (obs_freq[letter] * 611 math.log(inner_log) / math.log(log_base)) 612 total_info = total_info + letter_info 613 return total_info
614
615 - def get_column(self,col):
616 return self.alignment.get_column(col)
617
618 -class PSSM:
619 """Represent a position specific score matrix. 620 621 This class is meant to make it easy to access the info within a PSSM 622 and also make it easy to print out the information in a nice table. 623 624 Let's say you had an alignment like this: 625 GTATC 626 AT--C 627 CTGTC 628 629 The position specific score matrix (when printed) looks like: 630 631 G A T C 632 G 1 1 0 1 633 T 0 0 3 0 634 A 1 1 0 0 635 T 0 0 2 0 636 C 0 0 0 3 637 638 You can access a single element of the PSSM using the following: 639 640 your_pssm[sequence_number][residue_count_name] 641 642 For instance, to get the 'T' residue for the second element in the 643 above alignment you would need to do: 644 645 your_pssm[1]['T'] 646 """
647 - def __init__(self, pssm):
648 """Initialize with pssm data to represent. 649 650 The pssm passed should be a list with the following structure: 651 652 list[0] - The letter of the residue being represented (for instance, 653 from the example above, the first few list[0]s would be GTAT... 654 list[1] - A dictionary with the letter substitutions and counts. 655 """ 656 self.pssm = pssm
657
658 - def __getitem__(self, pos):
659 return self.pssm[pos][1]
660
661 - def __str__(self):
662 out = " " 663 all_residues = self.pssm[0][1].keys() 664 all_residues.sort() 665 666 # first print out the top header 667 for res in all_residues: 668 out = out + " %s" % res 669 out = out + "\n" 670 671 # for each item, write out the substitutions 672 for item in self.pssm: 673 out = out + "%s " % item[0] 674 for res in all_residues: 675 out = out + " %.1f" % item[1][res] 676 677 out = out + "\n" 678 return out
679
680 - def get_residue(self, pos):
681 """Return the residue letter at the specified position. 682 """ 683 return self.pssm[pos][0]
684 685 698 699 if __name__ == "__main__" : 700 print "Quick test" 701 from Bio import AlignIO 702 from Bio.Align.Generic import Alignment 703 704 filename = "../../Tests/GFF/multi.fna" 705 format = "fasta" 706 expected = FreqTable.FreqTable({"A":0.25,"G":0.25,"T":0.25,"C":0.25}, 707 FreqTable.FREQ, 708 IUPAC.unambiguous_dna) 709 710 alignment = AlignIO.read(open(filename), format) 711 for record in alignment : 712 print record.seq.tostring() 713 print "="*alignment.get_alignment_length() 714 715 summary = SummaryInfo(alignment) 716 consensus = summary.dumb_consensus(ambiguous="N") 717 print consensus 718 consensus = summary.gap_consensus(ambiguous="N") 719 print consensus 720 print 721 print summary.pos_specific_score_matrix(chars_to_ignore=['-'], 722 axis_seq=consensus) 723 print 724 #Have a generic alphabet, without a declared gap char, so must tell 725 #provide the frequencies and chars to ignore explicitly. 726 print summary.information_content(e_freq_table=expected, 727 chars_to_ignore=['-']) 728 print 729 print "Trying a protein sequence with gaps and stops" 730 731 alpha = Alphabet.HasStopCodon(Alphabet.Gapped(Alphabet.generic_protein, "-"), "*") 732 a = Alignment(alpha) 733 a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-") 734 a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*") 735 a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*") 736 print a 737 print "="*a.get_alignment_length() 738 739 s = SummaryInfo(a) 740 c = s.dumb_consensus(ambiguous="X") 741 print c 742 c = s.gap_consensus(ambiguous="X") 743 print c 744 print 745 print s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c) 746 747 print s.information_content(chars_to_ignore=['-', '*']) 748 749 750 print "Done" 751