1 """Extract information from alignment objects.
2
3 In order to try and avoid huge alignment objects with tons of functions,
4 functions which return summary type information about alignments should
5 be put into classes in this module.
6
7 classes:
8 o SummaryInfo
9 o PSSM
10 """
11
12
13 import math
14 import sys
15
16
17 try:
18 set = set
19 except NameError:
20 from sets import Set as set
21
22
23 from Bio import Alphabet
24 from Bio.Alphabet import IUPAC
25 from Bio.Seq import Seq
26 from Bio.SubsMat import FreqTable
27
28
29
30 Protein20Random = 0.05
31 Nucleotide4Random = 0.25
33 """Calculate summary info about the alignment.
34
35 This class should be used to caclculate information summarizing the
36 results of an alignment. This may either be straight consensus info
37 or more complicated things.
38 """
40 """Initialize with the alignment to calculate information on.
41 ic_vector attribute. A dictionary. Keys: column numbers. Values:
42 """
43 self.alignment = alignment
44 self.ic_vector = {}
45
46 - def dumb_consensus(self, threshold = .7, ambiguous = "X",
47 consensus_alpha = None, require_multiple = 0):
48 """Output a fast consensus sequence of the alignment.
49
50 This doesn't do anything fancy at all. It will just go through the
51 sequence residue by residue and count up the number of each type
52 of residue (ie. A or G or T or C for DNA) in all sequences in the
53 alignment. If the percentage of the most common residue type is
54 greater then the passed threshold, then we will add that residue type,
55 otherwise an ambiguous character will be added.
56
57 This could be made a lot fancier (ie. to take a substitution matrix
58 into account), but it just meant for a quick and dirty consensus.
59
60 Arguments:
61 o threshold - The threshold value that is required to add a particular
62 atom.
63 o ambiguous - The ambiguous character to be added when the threshold is
64 not reached.
65 o consensus_alpha - The alphabet to return for the consensus sequence.
66 If this is None, then we will try to guess the alphabet.
67 o require_multiple - If set as 1, this will require that more than
68 1 sequence be part of an alignment to put it in the consensus (ie.
69 not just 1 sequence and gaps).
70 """
71
72 consensus = ''
73
74
75 con_len = self.alignment.get_alignment_length()
76
77
78 for n in range(con_len):
79
80 atom_dict = {}
81 num_atoms = 0
82
83 for record in self.alignment._records:
84
85
86 if n < len(record.seq):
87 if record.seq[n] != '-' and record.seq[n] != '.':
88 if record.seq[n] not in atom_dict.keys():
89 atom_dict[record.seq[n]] = 1
90 else:
91 atom_dict[record.seq[n]] = \
92 atom_dict[record.seq[n]] + 1
93
94 num_atoms = num_atoms + 1
95
96 max_atoms = []
97 max_size = 0
98
99 for atom in atom_dict.keys():
100 if atom_dict[atom] > max_size:
101 max_atoms = [atom]
102 max_size = atom_dict[atom]
103 elif atom_dict[atom] == max_size:
104 max_atoms.append(atom)
105
106 if require_multiple and num_atoms == 1:
107 consensus = consensus + ambiguous
108 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms))
109 >= threshold):
110 consensus = consensus + max_atoms[0]
111 else:
112 consensus = consensus + ambiguous
113
114
115 if consensus_alpha is None:
116 consensus_alpha = self._guess_consensus_alphabet(ambiguous)
117
118 return Seq(consensus, consensus_alpha)
119
120 - def gap_consensus(self, threshold = .7, ambiguous = "X",
121 consensus_alpha = None, require_multiple = 0):
122 """Same as dumb_consensus(), but allows gap on the output.
123
124 Things to do: Let the user define that with only one gap, the result
125 character in consensus is gap. Let the user select gap character, now
126 it takes the same is input.
127 """
128
129 consensus = ''
130
131
132 con_len = self.alignment.get_alignment_length()
133
134
135 for n in range(con_len):
136
137 atom_dict = {}
138 num_atoms = 0
139
140 for record in self.alignment._records:
141
142
143 if n < len(record.seq):
144 if record.seq[n] not in atom_dict.keys():
145 atom_dict[record.seq[n]] = 1
146 else:
147 atom_dict[record.seq[n]] = \
148 atom_dict[record.seq[n]] + 1
149
150 num_atoms = num_atoms + 1
151
152 max_atoms = []
153 max_size = 0
154
155 for atom in atom_dict.keys():
156 if atom_dict[atom] > max_size:
157 max_atoms = [atom]
158 max_size = atom_dict[atom]
159 elif atom_dict[atom] == max_size:
160 max_atoms.append(atom)
161
162 if require_multiple and num_atoms == 1:
163 consensus = consensus + ambiguous
164 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms))
165 >= threshold):
166 consensus = consensus + max_atoms[0]
167 else:
168 consensus = consensus + ambiguous
169
170
171 if consensus_alpha is None:
172
173 consensus_alpha = self._guess_consensus_alphabet(ambiguous)
174
175 return Seq(consensus, consensus_alpha)
176
218
220 """Generate a replacement dictionary to plug into a substitution matrix
221
222 This should look at an alignment, and be able to generate the number
223 of substitutions of different residues for each other in the
224 aligned object.
225
226 Will then return a dictionary with this information:
227 {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....}
228
229 This also treats weighted sequences. The following example shows how
230 we calculate the replacement dictionary. Given the following
231 multiple sequence alignments:
232
233 GTATC 0.5
234 AT--C 0.8
235 CTGTC 1.0
236
237 For the first column we have:
238 ('A', 'G') : 0.5 * 0.8 = 0.4
239 ('C', 'G') : 0.5 * 1.0 = 0.5
240 ('A', 'C') : 0.8 * 1.0 = 0.8
241
242 We then continue this for all of the columns in the alignment, summing
243 the information for each substitution in each column, until we end
244 up with the replacement dictionary.
245
246 Arguments:
247 o skip_chars - A list of characters to skip when creating the dictionary.
248 For instance, you might have Xs (screened stuff) or Ns, and not want
249 to include the ambiguity characters in the dictionary.
250 """
251
252 rep_dict, skip_items = self._get_base_replacements(skip_chars)
253
254
255 for rec_num1 in range(len(self.alignment._records)):
256
257
258 for rec_num2 in range(rec_num1 + 1, len(self.alignment._records)):
259
260
261 rep_dict = self._pair_replacement(
262 self.alignment._records[rec_num1].seq,
263 self.alignment._records[rec_num2].seq,
264 self.alignment._records[rec_num1].annotations.get('weight',1),
265 self.alignment._records[rec_num2].annotations.get('weight',1),
266 rep_dict, skip_items)
267
268 return rep_dict
269
270 - def _pair_replacement(self, seq1, seq2, weight1, weight2,
271 start_dict, ignore_chars):
272 """Compare two sequences and generate info on the replacements seen.
273
274 Arguments:
275 o seq1, seq2 - The two sequences to compare.
276 o weight1, weight2 - The relative weights of seq1 and seq2.
277 o start_dict - The dictionary containing the starting replacement
278 info that we will modify.
279 o ignore_chars - A list of characters to ignore when calculating
280 replacements (ie. '-').
281
282 Returns:
283 o A replacment dictionary which is modified from initial_dict with
284 the information from the sequence comparison.
285 """
286
287 for residue_num in range(len(seq1)):
288 residue1 = seq1[residue_num]
289 try:
290 residue2 = seq2[residue_num]
291
292
293 except IndexError:
294 return start_dict
295
296
297 if (residue1 not in ignore_chars) and (residue2 not in ignore_chars):
298 try:
299
300
301 start_dict[(residue1, residue2)] = \
302 start_dict[(residue1, residue2)] + \
303 weight1 * weight2
304
305
306 except KeyError:
307 raise ValueError("Residues %s, %s not found in alphabet %s"
308 % (residue1, residue2,
309 self.alignment._alphabet))
310
311 return start_dict
312
313
315 """Returns a string containing the expected letters in the alignment."""
316 all_letters = self.alignment._alphabet.letters
317 if all_letters is None \
318 or (isinstance(self.alignment._alphabet, Alphabet.Gapped) \
319 and all_letters == self.alignment._alphabet.gap_char):
320
321
322
323 set_letters = set()
324 for record in self.alignment :
325
326
327 set_letters = set_letters.union(record.seq)
328 list_letters = list(set_letters)
329 list_letters.sort()
330 all_letters = "".join(list_letters)
331 return all_letters
332
334 """Get a zeroed dictonary of all possible letter combinations.
335
336 This looks at the type of alphabet and gets the letters for it.
337 It then creates a dictionary with all possible combinations of these
338 letters as keys (ie. ('A', 'G')) and sets the values as zero.
339
340 Returns:
341 o The base dictionary created
342 o A list of alphabet items to skip when filling the dictionary.Right
343 now the only thing I can imagine in this list is gap characters, but
344 maybe X's or something else might be useful later. This will also
345 include any characters that are specified to be skipped.
346 """
347 base_dictionary = {}
348 all_letters = self._get_all_letters()
349
350
351
352 if isinstance(self.alignment._alphabet, Alphabet.Gapped):
353 skip_items.append(self.alignment._alphabet.gap_char)
354 all_letters = all_letters.replace(self.alignment._alphabet.gap_char,'')
355
356
357 for first_letter in all_letters:
358 for second_letter in all_letters:
359 if (first_letter not in skip_items and
360 second_letter not in skip_items):
361 base_dictionary[(first_letter, second_letter)] = 0
362
363 return base_dictionary, skip_items
364
365
368 """Create a position specific score matrix object for the alignment.
369
370 This creates a position specific score matrix (pssm) which is an
371 alternative method to look at a consensus sequence.
372
373 Arguments:
374 o chars_to_ignore - A listing of all characters not to include in
375 the pssm. If the alignment alphabet declares a gap character,
376 then it will be excluded automatically.
377 o axis_seq - An optional argument specifying the sequence to
378 put on the axis of the PSSM. This should be a Seq object. If nothing
379 is specified, the consensus sequence, calculated with default
380 parameters, will be used.
381
382 Returns:
383 o A PSSM (position specific score matrix) object.
384 """
385
386 all_letters = self._get_all_letters()
387 assert all_letters
388
389 if not isinstance(chars_to_ignore, list) :
390 raise TypeError("chars_to_ignore should be a list.")
391
392
393 if isinstance(self.alignment._alphabet, Alphabet.Gapped):
394 chars_to_ignore.append(self.alignment._alphabet.gap_char)
395
396 for char in chars_to_ignore:
397 all_letters = all_letters.replace(char, '')
398
399 if axis_seq:
400 left_seq = axis_seq
401 assert len(axis_seq) == self.alignment.get_alignment_length()
402 else:
403 left_seq = self.dumb_consensus()
404
405 pssm_info = []
406
407 for residue_num in range(len(left_seq)):
408 score_dict = self._get_base_letters(all_letters)
409 for record in self.alignment._records:
410 try:
411 this_residue = record.seq[residue_num]
412
413
414 except IndexError:
415 this_residue = None
416
417 if this_residue and this_residue not in chars_to_ignore:
418 weight = record.annotations.get('weight', 1)
419 try:
420 score_dict[this_residue] += weight
421
422 except KeyError:
423 raise ValueError("Residue %s not found in alphabet %s"
424 % (this_residue,
425 self.alignment._alphabet))
426
427 pssm_info.append((left_seq[residue_num],
428 score_dict))
429
430
431 return PSSM(pssm_info)
432
434 """Create a zeroed dictionary with all of the specified letters.
435 """
436 base_info = {}
437 for letter in letters:
438 base_info[letter] = 0
439
440 return base_info
441
442 - def information_content(self, start = 0,
443 end = None,
444 e_freq_table = None, log_base = 2,
445 chars_to_ignore = []):
446 """Calculate the information content for each residue along an alignment.
447
448 Arguments:
449 o start, end - The starting an ending points to calculate the
450 information content. These points should be relative to the first
451 sequence in the alignment, starting at zero (ie. even if the 'real'
452 first position in the seq is 203 in the initial sequence, for
453 the info content, we need to use zero). This defaults to the entire
454 length of the first sequence.
455 o e_freq_table - A FreqTable object specifying the expected frequencies
456 for each letter in the alphabet we are using (e.g. {'G' : 0.4,
457 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be
458 included, since these should not have expected frequencies.
459 o log_base - The base of the logathrim to use in calculating the
460 information content. This defaults to 2 so the info is in bits.
461 o chars_to_ignore - A listing of characterw which should be ignored
462 in calculating the info content.
463
464 Returns:
465 o A number representing the info content for the specified region.
466
467 Please see the Biopython manual for more information on how information
468 content is calculated.
469 """
470
471 if end is None:
472 end = len(self.alignment._records[0].seq)
473
474 if start < 0 or end > len(self.alignment._records[0].seq):
475 raise ValueError \
476 ("Start (%s) and end (%s) are not in the range %s to %s"
477 % (start, end, 0, len(self.alignment._records[0].seq)))
478
479 random_expected = None
480 if not e_freq_table:
481
482 base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet)
483 if isinstance(base_alpha, Alphabet.ProteinAlphabet) :
484 random_expected = Protein20Random
485 elif isinstance(base_alpha, Alphabet.NucleotideAlphabet) :
486 random_expected = Nucleotide4Random
487 else :
488 errstr = "Error in alphabet: not Nucleotide or Protein, "
489 errstr += "supply expected frequencies"
490 raise ValueError(errstr)
491 del base_alpha
492 elif not isinstance(e_freq_table, FreqTable.FreqTable) :
493 raise ValueError("e_freq_table should be a FreqTable object")
494
495
496
497 all_letters = self._get_all_letters()
498 for char in chars_to_ignore:
499 all_letters = all_letters.replace(char, '')
500
501 info_content = {}
502 for residue_num in range(start, end):
503 freq_dict = self._get_letter_freqs(residue_num,
504 self.alignment._records,
505 all_letters, chars_to_ignore)
506
507 column_score = self._get_column_info_content(freq_dict,
508 e_freq_table,
509 log_base,
510 random_expected)
511
512 info_content[residue_num] = column_score
513
514 total_info = 0
515 for column_info in info_content.values():
516 total_info = total_info + column_info
517
518 for i in info_content.keys():
519 self.ic_vector[i] = info_content[i]
520 return total_info
521
523 """Determine the frequency of specific letters in the alignment.
524
525 Arguments:
526 o residue_num - The number of the column we are getting frequencies
527 from.
528 o all_records - All of the SeqRecords in the alignment.
529 o letters - The letters we are interested in getting the frequency
530 for.
531 o to_ignore - Letters we are specifically supposed to ignore.
532
533 This will calculate the frequencies of each of the specified letters
534 in the alignment at the given frequency, and return this as a
535 dictionary where the keys are the letters and the values are the
536 frequencies.
537 """
538 freq_info = self._get_base_letters(letters)
539
540 total_count = 0
541
542 for record in all_records:
543 try:
544 if record.seq[residue_num] not in to_ignore:
545 weight = record.annotations.get('weight',1)
546 freq_info[record.seq[residue_num]] += weight
547 total_count += weight
548
549 except KeyError:
550 raise ValueError("Residue %s not found in alphabet %s"
551 % (record.seq[residue_num],
552 self.alignment._alphabet))
553
554 if total_count == 0 :
555
556 for letter in freq_info.keys():
557 assert freq_info[letter] == 0
558
559 else :
560
561 for letter in freq_info.keys():
562 freq_info[letter] = freq_info[letter] / total_count
563
564 return freq_info
565
566 - def _get_column_info_content(self, obs_freq, e_freq_table, log_base,
567 random_expected):
568 """Calculate the information content for a column.
569
570 Arguments:
571 o obs_freq - The frequencies observed for each letter in the column.
572 o e_freq_table - An optional argument specifying the expected
573 frequencies for each letter. This is a SubsMat.FreqTable instance.
574 o log_base - The base of the logathrim to use in calculating the
575 info content.
576 """
577 try :
578 gap_char = self.alignment._alphabet.gap_char
579 except AttributeError :
580
581
582 gap_char = "-"
583
584 if e_freq_table:
585 if not isinstance(e_freq_table, FreqTable.FreqTable) :
586 raise ValueError("e_freq_table should be a FreqTable object")
587
588 for key in obs_freq.keys():
589 if (key != gap_char and key not in e_freq_table):
590 raise ValueError("Expected frequency letters %s" +
591 " do not match observed %s"
592 % (e_freq_table.keys(), obs_freq.keys() -
593 [gap_char]))
594
595 total_info = 0.0
596
597 for letter in obs_freq.keys():
598 inner_log = 0.0
599
600
601
602 if letter != gap_char:
603 if e_freq_table:
604 inner_log = obs_freq[letter] / e_freq_table[letter]
605 else:
606 inner_log = obs_freq[letter] / random_expected
607
608
609 if inner_log > 0:
610 letter_info = (obs_freq[letter] *
611 math.log(inner_log) / math.log(log_base))
612 total_info = total_info + letter_info
613 return total_info
614
617
619 """Represent a position specific score matrix.
620
621 This class is meant to make it easy to access the info within a PSSM
622 and also make it easy to print out the information in a nice table.
623
624 Let's say you had an alignment like this:
625 GTATC
626 AT--C
627 CTGTC
628
629 The position specific score matrix (when printed) looks like:
630
631 G A T C
632 G 1 1 0 1
633 T 0 0 3 0
634 A 1 1 0 0
635 T 0 0 2 0
636 C 0 0 0 3
637
638 You can access a single element of the PSSM using the following:
639
640 your_pssm[sequence_number][residue_count_name]
641
642 For instance, to get the 'T' residue for the second element in the
643 above alignment you would need to do:
644
645 your_pssm[1]['T']
646 """
648 """Initialize with pssm data to represent.
649
650 The pssm passed should be a list with the following structure:
651
652 list[0] - The letter of the residue being represented (for instance,
653 from the example above, the first few list[0]s would be GTAT...
654 list[1] - A dictionary with the letter substitutions and counts.
655 """
656 self.pssm = pssm
657
659 return self.pssm[pos][1]
660
662 out = " "
663 all_residues = self.pssm[0][1].keys()
664 all_residues.sort()
665
666
667 for res in all_residues:
668 out = out + " %s" % res
669 out = out + "\n"
670
671
672 for item in self.pssm:
673 out = out + "%s " % item[0]
674 for res in all_residues:
675 out = out + " %.1f" % item[1][res]
676
677 out = out + "\n"
678 return out
679
681 """Return the residue letter at the specified position.
682 """
683 return self.pssm[pos][0]
684
685
686 -def print_info_content(summary_info,fout=None,rep_record=0):
687 """ Three column output: position, aa in representative sequence,
688 ic_vector value"""
689 fout = fout or sys.stdout
690 if not summary_info.ic_vector:
691 summary_info.information_content()
692 rep_sequence = summary_info.alignment._records[rep_record].seq
693 positions = summary_info.ic_vector.keys()
694 positions.sort()
695 for pos in positions:
696 fout.write("%d %s %.3f\n" % (pos, rep_sequence[pos],
697 summary_info.ic_vector[pos]))
698
699 if __name__ == "__main__" :
700 print "Quick test"
701 from Bio import AlignIO
702 from Bio.Align.Generic import Alignment
703
704 filename = "../../Tests/GFF/multi.fna"
705 format = "fasta"
706 expected = FreqTable.FreqTable({"A":0.25,"G":0.25,"T":0.25,"C":0.25},
707 FreqTable.FREQ,
708 IUPAC.unambiguous_dna)
709
710 alignment = AlignIO.read(open(filename), format)
711 for record in alignment :
712 print record.seq.tostring()
713 print "="*alignment.get_alignment_length()
714
715 summary = SummaryInfo(alignment)
716 consensus = summary.dumb_consensus(ambiguous="N")
717 print consensus
718 consensus = summary.gap_consensus(ambiguous="N")
719 print consensus
720 print
721 print summary.pos_specific_score_matrix(chars_to_ignore=['-'],
722 axis_seq=consensus)
723 print
724
725
726 print summary.information_content(e_freq_table=expected,
727 chars_to_ignore=['-'])
728 print
729 print "Trying a protein sequence with gaps and stops"
730
731 alpha = Alphabet.HasStopCodon(Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
732 a = Alignment(alpha)
733 a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-")
734 a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*")
735 a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")
736 print a
737 print "="*a.get_alignment_length()
738
739 s = SummaryInfo(a)
740 c = s.dumb_consensus(ambiguous="X")
741 print c
742 c = s.gap_consensus(ambiguous="X")
743 print c
744 print
745 print s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
746
747 print s.information_content(chars_to_ignore=['-', '*'])
748
749
750 print "Done"
751