1 """Extract information from alignment objects.
2
3 In order to try and avoid huge alignment objects with tons of functions,
4 functions which return summary type information about alignments should
5 be put into classes in this module.
6
7 classes:
8 o SummaryInfo
9 o PSSM
10 """
11
12
13
14 import string
15 import math
16 import sys
17
18
19 from Bio import Alphabet
20 from Bio.Alphabet import IUPAC
21 from Bio.Seq import Seq
22
23
24
25 Protein20Random = 0.05
26 Nucleotide4Random = 0.25
28 """Calculate summary info about the alignment.
29
30 This class should be used to caclculate information summarizing the
31 results of an alignment. This may either be straight consensus info
32 or more complicated things.
33 """
35 """Initialize with the alignment to calculate information on.
36 ic_vector attribute. A dictionary. Keys: column numbers. Values:
37 """
38 self.alignment = alignment
39 self.ic_vector = {}
40
41 - def dumb_consensus(self, threshold = .7, ambiguous = "X",
42 consensus_alpha = None, require_multiple = 0):
43 """Output a fast consensus sequence of the alignment.
44
45 This doesn't do anything fancy at all. It will just go through the
46 sequence residue by residue and count up the number of each type
47 of residue (ie. A or G or T or C for DNA) in all sequences in the
48 alignment. If the percentage of the most common residue type is
49 greater then the passed threshold, then we will add that residue type,
50 otherwise an ambiguous character will be added.
51
52 This could be made a lot fancier (ie. to take a substitution matrix
53 into account), but it just meant for a quick and dirty consensus.
54
55 Arguments:
56 o threshold - The threshold value that is required to add a particular
57 atom.
58 o ambiguous - The ambiguous character to be added when the threshold is
59 not reached.
60 o consensus_alpha - The alphabet to return for the consensus sequence.
61 If this is None, then we will try to guess the alphabet.
62 o require_multiple - If set as 1, this will require that more than
63 1 sequence be part of an alignment to put it in the consensus (ie.
64 not just 1 sequence and gaps).
65 """
66
67 consensus = ''
68
69
70 con_len = self.alignment.get_alignment_length()
71
72
73 for n in range(con_len):
74
75 atom_dict = {}
76 num_atoms = 0
77
78 for record in self.alignment._records:
79
80
81 if n < len(record.seq):
82 if record.seq[n] != '-' and record.seq[n] != '.':
83 if record.seq[n] not in atom_dict.keys():
84 atom_dict[record.seq[n]] = 1
85 else:
86 atom_dict[record.seq[n]] = \
87 atom_dict[record.seq[n]] + 1
88
89 num_atoms = num_atoms + 1
90
91 max_atoms = []
92 max_size = 0
93
94 for atom in atom_dict.keys():
95 if atom_dict[atom] > max_size:
96 max_atoms = [atom]
97 max_size = atom_dict[atom]
98 elif atom_dict[atom] == max_size:
99 max_atoms.append(atom)
100
101 if require_multiple and num_atoms == 1:
102 consensus = consensus + ambiguous
103 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms))
104 >= threshold):
105 consensus = consensus + max_atoms[0]
106 else:
107 consensus = consensus + ambiguous
108
109
110 if consensus_alpha is None:
111 consensus_alpha = self._guess_consensus_alphabet(ambiguous)
112
113 return Seq(consensus, consensus_alpha)
114
115 - def gap_consensus(self, threshold = .7, ambiguous = "X",
116 consensus_alpha = None, require_multiple = 0):
117 """Same as dumb_consensus(), but allows gap on the output.
118
119 Things to do: Let the user define that with only one gap, the result
120 character in consensus is gap. Let the user select gap character, now
121 it takes the same is input.
122 """
123
124 consensus = ''
125
126
127 con_len = self.alignment.get_alignment_length()
128
129
130 for n in range(con_len):
131
132 atom_dict = {}
133 num_atoms = 0
134
135 for record in self.alignment._records:
136
137
138 if n < len(record.seq):
139 if record.seq[n] not in atom_dict.keys():
140 atom_dict[record.seq[n]] = 1
141 else:
142 atom_dict[record.seq[n]] = \
143 atom_dict[record.seq[n]] + 1
144
145 num_atoms = num_atoms + 1
146
147 max_atoms = []
148 max_size = 0
149
150 for atom in atom_dict.keys():
151 if atom_dict[atom] > max_size:
152 max_atoms = [atom]
153 max_size = atom_dict[atom]
154 elif atom_dict[atom] == max_size:
155 max_atoms.append(atom)
156
157 if require_multiple and num_atoms == 1:
158 consensus = consensus + ambiguous
159 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms))
160 >= threshold):
161 consensus = consensus + max_atoms[0]
162 else:
163 consensus = consensus + ambiguous
164
165
166 if consensus_alpha is None:
167
168 consensus_alpha = self._guess_consensus_alphabet(ambiguous)
169
170 return Seq(consensus, consensus_alpha)
171
173 """Pick an (ungapped) alphabet for an alignment consesus sequence.
174
175 This just looks at the sequences we have, checks their type, and
176 returns as appropriate type which seems to make sense with the
177 sequences we've got.
178 """
179
180 if isinstance(self.alignment._alphabet, Alphabet.Gapped) :
181 a = self.alignment._alphabet.alphabet
182 elif isinstance(self.alignment._alphabet, Alphabet.Alphabet) :
183 a = self.alignment._alphabet
184 else :
185 raise ValueError \
186 ("Alignment's alphabet property is invalid.")
187
188
189 for record in self.alignment :
190
191 if isinstance(record.seq.alphabet, Alphabet.Gapped) :
192 alt = record.seq.alphabet.alphabet
193 elif isinstance(record.seq.alphabet, Alphabet.Alphabet) :
194 alt = record.seq.alphabet
195 else :
196 raise ValueError \
197 ("Non-alphabet found in first of the alignment sequences.")
198
199 if not isinstance(alt, a.__class__) :
200 raise ValueError \
201 ("Alignment contains a sequence with an incompatible alphabet.")
202
203
204
205 if hasattr(a, "letters") and a.letters is not None \
206 and ambiguous not in a.letters :
207
208 if isinstance(a, IUPAC.IUPACUnambiguousDNA) :
209 if ambiguous in IUPAC.IUPACUnambiguousDNA().letters :
210 a = IUPAC.IUPACUnambiguousDNA()
211 else :
212 a = Alphabet.generic_dna
213 elif isinstance(a, IUPAC.IUPACUnambiguousRNA) :
214 if ambiguous in IUPAC.IUPACUnambiguousRNA().letters :
215 a = IUPAC.IUPACUnambiguousRNA()
216 else :
217 a = Alphabet.generic_rna
218 elif isinstance(a, IUPAC.IUPACProtein) :
219 if ambiguous in IUPAC.ExtendedIUPACProtein().letters :
220 a = IUPAC.ExtendedIUPACProtein()
221 else :
222 a = Alphabet.generic_protein
223 else :
224 a = Alphabet.single_letter_alphabet
225 return a
226
228 """Generate a replacement dictionary to plug into a substitution matrix
229
230 This should look at an alignment, and be able to generate the number
231 of substitutions of different residues for each other in the
232 aligned object.
233
234 Will then return a dictionary with this information:
235 {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....}
236
237 This also treats weighted sequences. The following example shows how
238 we calculate the replacement dictionary. Given the following
239 multiple sequence alignments:
240
241 GTATC 0.5
242 AT--C 0.8
243 CTGTC 1.0
244
245 For the first column we have:
246 ('A', 'G') : 0.5 * 0.8 = 0.4
247 ('C', 'G') : 0.5 * 1.0 = 0.5
248 ('A', 'C') : 0.8 * 1.0 = 0.8
249
250 We then continue this for all of the columns in the alignment, summing
251 the information for each substitution in each column, until we end
252 up with the replacement dictionary.
253
254 Arguments:
255 o skip_chars - A list of characters to skip when creating the dictionary.
256 For instance, you might have Xs (screened stuff) or Ns, and not want
257 to include the ambiguity characters in the dictionary.
258 """
259
260 rep_dict, skip_items = self._get_base_replacements(skip_chars)
261
262
263 for rec_num1 in range(len(self.alignment._records)):
264
265
266 for rec_num2 in range(rec_num1 + 1, len(self.alignment._records)):
267
268
269 rep_dict = self._pair_replacement(
270 self.alignment._records[rec_num1].seq,
271 self.alignment._records[rec_num2].seq,
272 self.alignment._records[rec_num1].annotations.get('weight',1),
273 self.alignment._records[rec_num2].annotations.get('weight',1),
274 rep_dict, skip_items)
275
276 return rep_dict
277
278 - def _pair_replacement(self, seq1, seq2, weight1, weight2,
279 start_dict, ignore_chars):
280 """Compare two sequences and generate info on the replacements seen.
281
282 Arguments:
283 o seq1, seq2 - The two sequences to compare.
284 o weight1, weight2 - The relative weights of seq1 and seq2.
285 o start_dict - The dictionary containing the starting replacement
286 info that we will modify.
287 o ignore_chars - A list of characters to ignore when calculating
288 replacements (ie. '-').
289
290 Returns:
291 o A replacment dictionary which is modified from initial_dict with
292 the information from the sequence comparison.
293 """
294
295 for residue_num in range(len(seq1)):
296 residue1 = seq1[residue_num]
297 try:
298 residue2 = seq2[residue_num]
299
300
301 except IndexError:
302 return start_dict
303
304
305 if (residue1 not in ignore_chars) and (residue2 not in ignore_chars):
306 try:
307
308
309 start_dict[(residue1, residue2)] = \
310 start_dict[(residue1, residue2)] + \
311 weight1 * weight2
312
313
314 except KeyError:
315 raise ValueError("Residues %s, %s not found in alphabet %s"
316 % (residue1, residue2,
317 self.alignment._alphabet))
318
319 return start_dict
320
321
323 """Returns a string containing the expected letters in the alignment."""
324 all_letters = self.alignment._alphabet.letters
325 if all_letters is None \
326 or (isinstance(self.alignment._alphabet, Alphabet.Gapped) \
327 and all_letters == self.alignment._alphabet.gap_char):
328
329
330
331 from sets import Set
332 set_letters = Set()
333 for record in self.alignment :
334 set_letters.union_update(record.seq)
335 list_letters = list(set_letters)
336 list_letters.sort()
337 all_letters = "".join(list_letters)
338 return all_letters
339
341 """Get a zeroed dictonary of all possible letter combinations.
342
343 This looks at the type of alphabet and gets the letters for it.
344 It then creates a dictionary with all possible combinations of these
345 letters as keys (ie. ('A', 'G')) and sets the values as zero.
346
347 Returns:
348 o The base dictionary created
349 o A list of alphabet items to skip when filling the dictionary.Right
350 now the only thing I can imagine in this list is gap characters, but
351 maybe X's or something else might be useful later. This will also
352 include any characters that are specified to be skipped.
353 """
354 base_dictionary = {}
355 all_letters = self._get_all_letters()
356
357
358
359 if isinstance(self.alignment._alphabet, Alphabet.Gapped):
360 skip_items.append(self.alignment._alphabet.gap_char)
361 all_letters = string.replace(all_letters,
362 self.alignment._alphabet.gap_char,
363 '')
364
365
366 for first_letter in all_letters:
367 for second_letter in all_letters:
368 if (first_letter not in skip_items and
369 second_letter not in skip_items):
370 base_dictionary[(first_letter, second_letter)] = 0
371
372 return base_dictionary, skip_items
373
374
377 """Create a position specific score matrix object for the alignment.
378
379 This creates a position specific score matrix (pssm) which is an
380 alternative method to look at a consensus sequence.
381
382 Arguments:
383 o chars_to_ignore - A listing of all characters not to include in
384 the pssm. If the alignment alphabet declares a gap character,
385 then it will be excluded automatically.
386 o axis_seq - An optional argument specifying the sequence to
387 put on the axis of the PSSM. This should be a Seq object. If nothing
388 is specified, the consensus sequence, calculated with default
389 parameters, will be used.
390
391 Returns:
392 o A PSSM (position specific score matrix) object.
393 """
394
395 all_letters = self._get_all_letters()
396 assert all_letters
397
398 if not isinstance(chars_to_ignore, list) :
399 raise TypeError("chars_to_ignore should be a list.")
400
401
402 if isinstance(self.alignment._alphabet, Alphabet.Gapped):
403 chars_to_ignore.append(self.alignment._alphabet.gap_char)
404
405 for char in chars_to_ignore:
406 all_letters = string.replace(all_letters, char, '')
407
408 if axis_seq:
409 left_seq = axis_seq
410 assert len(axis_seq) == self.alignment.get_alignment_length()
411 else:
412 left_seq = self.dumb_consensus()
413
414 pssm_info = []
415
416 for residue_num in range(len(left_seq)):
417 score_dict = self._get_base_letters(all_letters)
418 for record in self.alignment._records:
419 try:
420 this_residue = record.seq[residue_num]
421
422
423 except IndexError:
424 this_residue = None
425
426 if this_residue and this_residue not in chars_to_ignore:
427 weight = record.annotations.get('weight', 1)
428 try:
429 score_dict[this_residue] += weight
430
431 except KeyError:
432 raise ValueError("Residue %s not found in alphabet %s"
433 % (this_residue,
434 self.alignment._alphabet))
435
436 pssm_info.append((left_seq[residue_num],
437 score_dict))
438
439
440 return PSSM(pssm_info)
441
443 """Create a zeroed dictionary with all of the specified letters.
444 """
445 base_info = {}
446 for letter in letters:
447 base_info[letter] = 0
448
449 return base_info
450
451 - def information_content(self, start = 0,
452 end = None,
453 e_freq_table = None, log_base = 2,
454 chars_to_ignore = []):
455 """Calculate the information content for each residue along an alignment.
456
457 Arguments:
458 o start, end - The starting an ending points to calculate the
459 information content. These points should be relative to the first
460 sequence in the alignment, starting at zero (ie. even if the 'real'
461 first position in the seq is 203 in the initial sequence, for
462 the info content, we need to use zero). This defaults to the entire
463 length of the first sequence.
464 o e_freq_table - A FreqTable object specifying the expected frequencies
465 for each letter in the alphabet we are using (ie. {'G' : 0.4,
466 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be
467 included, since these should not have expected frequencies.
468 o log_base - The base of the logathrim to use in calculating the
469 information content. This defaults to 2 so the info is in bits.
470 o chars_to_ignore - A listing of characterw which should be ignored
471 in calculating the info content.
472
473 Returns:
474 o A number representing the info content for the specified region.
475
476 Please see the Biopython manual for more information on how information
477 content is calculated.
478 """
479
480 if end is None:
481 end = len(self.alignment._records[0].seq)
482
483 if start < 0 or end > len(self.alignment._records[0].seq):
484 raise ValueError \
485 ("Start (%s) and end (%s) are not in the range %s to %s"
486 % (start, end, 0, len(self.alignment._records[0].seq)))
487
488
489
490 random_expected = None
491 if not e_freq_table:
492 if isinstance(self.alignment._alphabet,
493 Alphabet.ProteinAlphabet):
494 random_expected = Protein20Random
495 elif isinstance(self.alignment._alphabet, Alphabet.RNAAlphabet) or \
496 isinstance(self.alignment._alphabet, Alphabet.DNAAlphabet):
497 random_expected = Nucleotide4Random
498
499 elif isinstance(self.alignment._alphabet, Alphabet.AlphabetEncoder):
500 if isinstance(self.alignment._alphabet.alphabet, Alphabet.ProteinAlphabet):
501 random_expected = Protein20Random
502 elif isinstance(self.alignment._alphabet.alphabet, Alphabet.RNAAlphabet) or \
503 isinstance(self.alignment._alphabet.alphabet, Alphabet.DNAAlphabet):
504 random_expected = Nucleotide4Random
505 if not random_expected :
506 errstr = "Error in alphabet: not Nucleotide or Protein, "
507 errstr += "supply expected frequencies"
508 raise ValueError, errstr
509
510
511 all_letters = self._get_all_letters()
512 for char in chars_to_ignore:
513 all_letters = string.replace(all_letters, char, '')
514
515 info_content = {}
516 for residue_num in range(start, end):
517 freq_dict = self._get_letter_freqs(residue_num,
518 self.alignment._records,
519 all_letters, chars_to_ignore)
520
521 column_score = self._get_column_info_content(freq_dict,
522 e_freq_table,
523 log_base,
524 random_expected)
525
526 info_content[residue_num] = column_score
527
528 total_info = 0
529 for column_info in info_content.values():
530 total_info = total_info + column_info
531
532 for i in info_content.keys():
533 self.ic_vector[i] = info_content[i]
534 return total_info
535
537 """Determine the frequency of specific letters in the alignment.
538
539 Arguments:
540 o residue_num - The number of the column we are getting frequencies
541 from.
542 o all_records - All of the SeqRecords in the alignment.
543 o letters - The letters we are interested in getting the frequency
544 for.
545 o to_ignore - Letters we are specifically supposed to ignore.
546
547 This will calculate the frequencies of each of the specified letters
548 in the alignment at the given frequency, and return this as a
549 dictionary where the keys are the letters and the values are the
550 frequencies.
551 """
552 freq_info = self._get_base_letters(letters)
553
554 total_count = 0
555
556 for record in all_records:
557 try:
558 if record.seq[residue_num] not in to_ignore:
559 weight = record.annotations.get('weight',1)
560 freq_info[record.seq[residue_num]] += weight
561 total_count += weight
562
563 except KeyError:
564 raise ValueError("Residue %s not found in alphabet %s"
565 % (record.seq[residue_num],
566 self.alignment._alphabet))
567
568
569 for letter in freq_info.keys():
570 freq_info[letter] = freq_info[letter] / total_count
571
572 return freq_info
573
574 - def _get_column_info_content(self, obs_freq, e_freq_table, log_base,
575 random_expected):
576 """Calculate the information content for a column.
577
578 Arguments:
579 o obs_freq - The frequencies observed for each letter in the column.
580 o e_freq_table - An optional argument specifying the expected
581 frequencies for each letter. This is a SubsMat.FreqTable instance.
582 o log_base - The base of the logathrim to use in calculating the
583 info content.
584 """
585 if e_freq_table:
586
587 for key in obs_freq.keys():
588 if (key != self.alignment._alphabet.gap_char and
589 key not in e_freq_table):
590 raise ValueError("Expected frequency letters %s" +
591 " do not match observed %s"
592 % (e_freq_table.keys(), obs_freq.keys() -
593 [self.alignment._alphabet.gap_char]))
594
595 total_info = 0
596
597 for letter in obs_freq.keys():
598 inner_log = 0.
599
600
601
602 if letter != self.alignment._alphabet.gap_char:
603 if e_freq_table:
604 inner_log = obs_freq[letter] / e_freq_table[letter]
605 else:
606 inner_log = obs_freq[letter] / random_expected
607
608
609 if inner_log > 0:
610 letter_info = (obs_freq[letter] *
611 math.log(inner_log) / math.log(log_base))
612 total_info = total_info + letter_info
613 return total_info
614
617
619 """Represent a position specific score matrix.
620
621 This class is meant to make it easy to access the info within a PSSM
622 and also make it easy to print out the information in a nice table.
623
624 Let's say you had an alignment like this:
625 GTATC
626 AT--C
627 CTGTC
628
629 The position specific score matrix (when printed) looks like:
630
631 G A T C
632 G 1 1 0 1
633 T 0 0 3 0
634 A 1 1 0 0
635 T 0 0 2 0
636 C 0 0 0 3
637
638 You can access a single element of the PSSM using the following:
639
640 your_pssm[sequence_number][residue_count_name]
641
642 For instance, to get the 'T' residue for the second element in the
643 above alignment you would need to do:
644
645 your_pssm[1]['T']
646 """
648 """Initialize with pssm data to represent.
649
650 The pssm passed should be a list with the following structure:
651
652 list[0] - The letter of the residue being represented (for instance,
653 from the example above, the first few list[0]s would be GTAT...
654 list[1] - A dictionary with the letter substitutions and counts.
655 """
656 self.pssm = pssm
657
659 return self.pssm[pos][1]
660
662 out = " "
663 all_residues = self.pssm[0][1].keys()
664 all_residues.sort()
665
666
667 for res in all_residues:
668 out = out + " %s" % res
669 out = out + "\n"
670
671
672 for item in self.pssm:
673 out = out + "%s " % item[0]
674 for res in all_residues:
675 out = out + " %.1f" % item[1][res]
676
677 out = out + "\n"
678 return out
679
681 """Return the residue letter at the specified position.
682 """
683 return self.pssm[pos][0]
684
685
686 -def print_info_content(summary_info,fout=None,rep_record=0):
687 """ Three column output: position, aa in representative sequence,
688 ic_vector value"""
689 fout = fout or sys.stdout
690 if not summary_info.ic_vector:
691 summary_info.information_content()
692 rep_sequence = summary_info.alignment._records[rep_record].seq
693 positions = summary_info.ic_vector.keys()
694 positions.sort()
695 for pos in positions:
696 fout.write("%d %s %.3f\n" % (pos, rep_sequence[pos],
697 summary_info.ic_vector[pos]))
698
699 if __name__ == "__main__" :
700 print "Quick test"
701 from Bio import AlignIO
702
703 filename = "../../Tests/GFF/multi.fna"
704 format = "fasta"
705
706
707
708
709 alignment = AlignIO.read(open(filename), format)
710 for record in alignment :
711 print record.seq.tostring()
712 print "="*alignment.get_alignment_length()
713
714 summary = SummaryInfo(alignment)
715 consensus = summary.dumb_consensus(ambiguous="N")
716 print consensus
717 consensus = summary.gap_consensus(ambiguous="N")
718 print consensus
719 print
720 print summary.pos_specific_score_matrix(chars_to_ignore=['-'],
721 axis_seq=consensus)
722 print
723 print summary.information_content()
724 print
725 print "Done"
726