Package Bio :: Package Align :: Module Generic
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Generic

  1  """ 
  2  Contains classes to deal with generic sequence alignment stuff not 
  3  specific to a particular program or format. 
  4   
  5  classes: 
  6  o Alignment 
  7  """ 
  8   
  9  # standard library 
 10  import string 
 11   
 12  # biopython 
 13  from Bio.Seq import Seq 
 14  from Bio.SeqRecord import SeqRecord 
 15  from Bio import Alphabet 
 16  from Bio.Alphabet import IUPAC 
 17   
18 -class Alignment:
19 """Represent a set of alignments. 20 21 This is a base class to represent alignments, which can be subclassed 22 to deal with an alignment in a specific format. 23 """
24 - def __init__(self, alphabet):
25 """Initialize a new Alignment object. 26 27 Arguments: 28 o alphabet - The alphabet to use for the sequence objects that are 29 created. This alphabet must be a gapped type. 30 """ 31 self._alphabet = alphabet 32 # hold everything at a list of seq record objects 33 self._records = []
34
35 - def _str_line(self, record) :
36 """Returns a truncated string representation of a SeqRecord. 37 38 This is a PRIVATE function used by the __str__ method. 39 """ 40 if len(record.seq) <= 50 : 41 return "%s %s" % (record.seq, record.id) 42 else : 43 return "%s...%s %s" \ 44 % (record.seq[:44], record.seq[-3:], record.id)
45
46 - def __str__(self) :
47 """Returns a multi-line string summary of the alignment. 48 49 This output is intended to be readable, but large alignments are 50 shown truncated. A maximum of 20 rows (sequences) and 50 columns 51 are shown, with the record identifiers. This should fit nicely on a 52 single screen. e.g. 53 54 DNAAlphabet() alignment with 3 rows and 14 columns 55 ACGATCAGCTAGCT Alpha 56 CCGATCAGCTAGCT Beta 57 ACGATGAGCTAGCT Gamma 58 """ 59 rows = len(self._records) 60 lines = ["%s alignment with %i rows and %i columns" \ 61 % (str(self._alphabet), rows, self.get_alignment_length())] 62 if rows <= 20 : 63 lines.extend([self._str_line(rec) for rec in self._records]) 64 else : 65 lines.extend([self._str_line(rec) for rec in self._records[:18]]) 66 lines.append("...") 67 lines.append(self._str_line(self._records[-1])) 68 return "\n".join(lines)
69
70 - def __repr__(self) :
71 """Returns a representation of the object for debugging. 72 73 The representation cannot be used with eval() to recreate the object, 74 which is usually possible with simple python ojects. For example: 75 76 <Bio.Align.Generic.Alignment instance (2 records of length 14, 77 SingleLetterAlphabet()) at a3c184c> 78 79 The hex string is the memory address of the object, see help(id). 80 This provides a simple way to visually distinguish alignments of 81 the same size. 82 """ 83 return "<%s instance (%i records of length %i, %s) at %x>" % \ 84 (self.__class__, len(self._records), 85 self.get_alignment_length(), repr(self._alphabet), id(self))
86 #This version is useful for doing eval(repr(alignment)), 87 #but it can be VERY long: 88 #return "%s(%s, %s)" \ 89 # % (self.__class__, repr(self._records), repr(self._alphabet)) 90
91 - def get_all_seqs(self):
92 """Return all of the sequences involved in the alignment. 93 94 The return value is a list of SeqRecord objects. 95 """ 96 return self._records
97
98 - def __iter__(self) :
99 """Iterate over alignment rows as SeqRecord objects 100 101 e.g. 102 103 for record in align : 104 print record.id 105 print record.seq 106 """ 107 return iter(self._records)
108
109 - def get_seq_by_num(self, number):
110 """Retrieve a sequence by row number. 111 112 Returns: 113 o A Seq object for the requested sequence. 114 115 Raises: 116 o IndexError - If the specified number is out of range. 117 """ 118 return self._records[number].seq
119
120 - def get_alignment_length(self):
121 """Return the maximum length of the alignment. 122 123 All objects in the alignment should (hopefully) have the same 124 length. This function will go through and find this length 125 by finding the maximum length of sequences in the alignment. 126 """ 127 max_length = 0 128 129 for record in self._records: 130 if len(record.seq) > max_length: 131 max_length = len(record.seq) 132 133 return max_length
134
135 - def add_sequence(self, descriptor, sequence, start = None, end = None, 136 weight = 1.0):
137 """Add a sequence to the alignment. 138 139 This doesn't do any kind of alignment, it just adds in the sequence 140 object, which is assumed to be prealigned with the existing 141 sequences. 142 143 Arguments: 144 o descriptor - The descriptive id of the sequence being added. 145 This will be used as the resulting SeqRecord's 146 .id property (and, for historical compatibility, 147 also the .description property) 148 o sequence - A string with sequence info. 149 o start - You can explicitly set the start point of the sequence. 150 This is useful (at least) for BLAST alignments, which can just 151 be partial alignments of sequences. 152 o end - Specify the end of the sequence, which is important 153 for the same reason as the start. 154 o weight - The weight to place on the sequence in the alignment. 155 By default, all sequences have the same weight. (0.0 => no weight, 156 1.0 => highest weight) 157 """ 158 new_seq = Seq(sequence, self._alphabet) 159 160 #We are now effectively using the SeqRecord's .id as 161 #the primary identifier (e.g. in Bio.SeqIO) so we should 162 #populate it with the descriptor. 163 #For backwards compatibility, also store this in the 164 #SeqRecord's description property. 165 new_record = SeqRecord(new_seq, 166 id = descriptor, 167 description = descriptor) 168 169 # hack! We really need to work out how to deal with annotations 170 # and features in biopython. Right now, I'll just use the 171 # generic annotations dictionary we've got to store the start 172 # and end, but we should think up something better. I don't know 173 # if I'm really a big fan of the LocatableSeq thing they've got 174 # in BioPerl, but I'm not positive what the best thing to do on 175 # this is... 176 if start: 177 new_record.annotations['start'] = start 178 if end: 179 new_record.annotations['end'] = end 180 181 # another hack to add weight information to the sequence 182 new_record.annotations['weight'] = weight 183 184 self._records.append(new_record)
185
186 - def get_column(self,col):
187 """Returns a string containing a given column.""" 188 col_str = '' 189 assert col >= 0 and col <= self.get_alignment_length() 190 for rec in self._records: 191 col_str += rec.seq[col] 192 return col_str
193 194 if __name__ == "__main__" : 195 print "Mini self test..." 196 197 raw_data = ["ACGATCAGCTAGCT", "CCGATCAGCTAGCT", "ACGATGAGCTAGCT"] 198 a = Alignment(Alphabet.generic_dna) 199 a.add_sequence("Alpha", raw_data[0], weight=2) 200 a.add_sequence("Beta", raw_data[1]) 201 a.add_sequence("Gamma", raw_data[2]) 202 203 print 204 print "str(a):" 205 print str(a) 206 print 207 print "repr(a):" 208 print repr(a) 209 print 210 211 #Iterating over the rows... 212 for rec in a : 213 assert isinstance(rec, SeqRecord) 214 for r,rec in enumerate(a) : 215 assert isinstance(rec, SeqRecord) 216 assert raw_data[r] == rec.seq.tostring() 217 if r==0 : assert rec.annotations['weight']==2 218 print "Alignment iteraction as SeqRecord OK" 219