Package Bio :: Module SeqFeature
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqFeature

  1  """Represent a Sequence Feature holding info about a part of a sequence. 
  2   
  3  This is heavily modeled after the Biocorba SeqFeature objects, and 
  4  may be pretty biased towards GenBank stuff since I'm writing it 
  5  for the GenBank parser output... 
  6   
  7  What's here: 
  8   
  9  Base class to hold a Feature. 
 10  ---------------------------- 
 11  classes: 
 12  o SeqFeature 
 13   
 14  Hold information about a Reference. 
 15  ---------------------------------- 
 16   
 17  This is an attempt to create a General class to hold Reference type 
 18  information. 
 19   
 20  classes: 
 21  o Reference 
 22   
 23  Specify locations of a feature on a Sequence. 
 24  --------------------------------------------- 
 25   
 26  This aims to handle, in Ewan's words, 'the dreaded fuzziness issue' in 
 27  much the same way as Biocorba. This has the advantages of allowing us 
 28  to handle fuzzy stuff in case anyone needs it, and also be compatible 
 29  with Biocorba. 
 30   
 31  classes: 
 32  o FeatureLocation - Specify the start and end location of a feature. 
 33   
 34  o ExactPosition - Specify the position as being exact. 
 35  o WithinPosition - Specify a position occurring within some range. 
 36  o BetweenPosition - Specify a position occurring between a range. 
 37  o BeforePosition - Specify the position as being found before some base. 
 38  o AfterPosition - Specify the position as being found after some base. 
 39  """ 
 40   
class SeqFeature:
    """Represent a Sequence Feature on an object.

    Attributes:
    o location - the location of the feature on the sequence
    o type - the specified type of the feature (ie. CDS, exon, repeat...)
    o location_operator - a string specifying how this SeqFeature may
    be related to others. For example, in the example GenBank feature
    shown below, the location_operator would be "join"
    o strand - A value specifying on which strand (of a DNA sequence, for
    instance) the feature deals with. 1 indicates the plus strand, -1
    indicates the minus strand, 0 indicates both strands, and None indicates
    that strand doesn't apply (ie. for proteins) or is not known.
    o id - A string identifier for the feature.
    o ref - A reference to another sequence. This could be an accession
    number for some different sequence.
    o ref_db - A different database for the reference accession number.
    o qualifiers - A dictionary of qualifiers on the feature. These are
    analogous to the qualifiers from a GenBank feature table. The keys of
    the dictionary are qualifier names, the values are the qualifier
    values.
    o sub_features - Additional SeqFeatures which fall under this 'parent'
    feature. For instance, if we have something like:

    CDS    join(1..10,30..40,50..60)

    Then the top level feature would be a CDS from 1 to 60, and the sub
    features would be of 'CDS_join' type and would be from 1 to 10, 30 to
    40 and 50 to 60, respectively.
    """
    def __init__(self, location=None, type='', location_operator='',
                 strand=None, id="<unknown id>",
                 qualifiers=None, sub_features=None,
                 ref=None, ref_db=None):
        """Initialize a SeqFeature on a Sequence.

        All arguments are optional and are stored on the attribute of the
        same name. When qualifiers or sub_features is omitted (None), the
        instance gets its own fresh empty dictionary / list.
        """
        self.location = location

        self.type = type
        self.location_operator = location_operator
        self.strand = strand
        self.id = id
        # The defaults are None rather than {} / []: a literal default is a
        # single object shared by every call, so one feature's qualifiers
        # or sub-features would leak into all the others.
        # NOTE(review): the "recursive" problems that previously led to
        # these two arguments being ignored were presumably caused by that
        # sharing -- confirm against the GenBank parser.
        if qualifiers is None:
            qualifiers = {}
        if sub_features is None:
            sub_features = []
        self.qualifiers = qualifiers
        self.sub_features = sub_features
        self.ref = ref
        self.ref_db = ref_db

    def __str__(self):
        """Make it easier to debug features.

        Returns a multi-line string listing the type, location, reference,
        strand, the qualifiers in sorted key order and, when present, the
        string form of every sub-feature.
        """
        pieces = [
            "type: %s\n" % self.type,
            "location: %s\n" % self.location,
            "ref: %s:%s\n" % (self.ref, self.ref_db),
            "strand: %s\n" % self.strand,
            "qualifiers: \n",
        ]
        # sorted() works on any iterable of keys; calling .sort() on the
        # result of dict.keys() only works where keys() returns a list.
        for qual_key in sorted(self.qualifiers):
            pieces.append("\tKey: %s, Value: %s\n"
                          % (qual_key, self.qualifiers[qual_key]))
        if self.sub_features:
            pieces.append("Sub-Features\n")
            for sub_feature in self.sub_features:
                pieces.append("%s\n" % sub_feature)

        return "".join(pieces)
111 112 # --- References 113 114 # TODO -- Will this hold PubMed and Medline information decently?
class Reference:
    """Represent a Generic Reference object.

    Attributes:
    o location - A list of Location objects specifying regions of
    the sequence that the references correspond to. If no locations are
    specified, the entire sequence is assumed.
    o authors - A big old string, or a list split by author, of authors
    for the reference.
    o consrtm - A string, empty by default, which is only included in the
    string output when it is set (presumably the GenBank CONSRTM
    consortium line -- confirm against the parser).
    o title - The title of the reference.
    o journal - Journal the reference was published in.
    o medline_id - A medline reference for the article.
    o pubmed_id - A pubmed reference for the article.
    o comment - A place to stick any comments about the reference.
    """
    def __init__(self):
        """Create an empty reference; callers fill in the attributes."""
        self.location = []
        # every text field starts out as the empty string
        self.authors = self.consrtm = ''
        self.title = self.journal = ''
        self.medline_id = self.pubmed_id = ''
        self.comment = ''

    def __str__(self):
        """Output an informative string for debugging.

        One "name: value" line is produced per location and per text
        field; the consrtm line is left out while that field is empty.
        """
        pieces = ["location: %s\n" % single_location
                  for single_location in self.location]
        pieces.append("authors: %s\n" % self.authors)
        if self.consrtm:
            pieces.append("consrtm: %s\n" % self.consrtm)
        pieces.extend([
            "title: %s\n" % self.title,
            "journal: %s\n" % self.journal,
            "medline id: %s\n" % self.medline_id,
            "pubmed id: %s\n" % self.pubmed_id,
            "comment: %s\n" % self.comment,
        ])

        return "".join(pieces)
156 157 # --- Handling feature locations 158
class FeatureLocation:
    """Specify the location of a feature along a sequence.

    This attempts to deal with fuzziness of position ends, but also
    makes it easy to get the start and end in the 'normal' case (no
    fuzziness).

    You should access the start and end attributes with
    your_location.start and your_location.end. Both return the stored
    position object: an ExactPosition when the end was given as a plain
    value, otherwise whatever position object was passed in.

    Note that the start and end location numbering follow Python's scheme,
    thus a GenBank entry of 123..150 (one based counting) becomes a location
    of [122:150] (zero based counting).
    """
    def __init__(self, start, end):
        """Specify the start and end of a sequence feature.

        start and end specify where the feature begins and ends. Each can
        be any object inheriting from AbstractPosition, which is stored
        unchanged, or any other value (typically an integer), which is
        taken to be exact and wrapped in an ExactPosition. This is meant
        to make it easy to deal with non-fuzzy ends.
        """
        self._start = (start if isinstance(start, AbstractPosition)
                       else ExactPosition(start))
        self._end = (end if isinstance(end, AbstractPosition)
                     else ExactPosition(end))

    def __str__(self):
        """Return a representation of the location.

        For the simple case this uses the python slicing syntax,
        [122:150] (zero based counting), which GenBank would call
        123..150 (one based counting).
        """
        bounds = (self._start, self._end)
        return "[%s:%s]" % bounds

    def __getattr__(self, attr):
        """Make it easy to get non-fuzzy starts and ends.

        Attribute lookup is overridden so that four computed attributes
        are available:

        o start, end - the stored position objects.
        o nofuzzy_start - the smaller of position and position + extension
        of the start position.
        o nofuzzy_end - the larger of position and position + extension
        of the end position.

        Together the nofuzzy values give the largest range of the fuzzy
        position, so something like (10.20)..(30.40) gives 10 for the
        start and 40 for the end. Any other name raises AttributeError.
        """
        if attr == 'start':
            return self._start
        if attr == 'end':
            return self._end
        if attr == 'nofuzzy_start':
            lower = self._start
            return min(lower.position, lower.position + lower.extension)
        if attr == 'nofuzzy_end':
            upper = self._end
            return max(upper.position, upper.position + upper.extension)
        raise AttributeError("Cannot evaluate attribute %s." % attr)
226
class AbstractPosition:
    """Abstract base class representing a position.

    Attributes:
    o position - the coordinate of the position.
    o extension - the range added to position by the fuzzy subclasses.

    Positions compare (==, !=, <, <=, >, >=) and hash on the position
    attribute only; extensions are not considered at all. Because the
    hash follows position, do not change position while the object is
    used as a dictionary key or set member.
    """
    def __init__(self, position, extension):
        """Store the position and its extension."""
        self.position = position
        self.extension = extension

    def __cmp__(self, other):
        """A simple comparison function for positions.

        This is very simple-minded and just compares the position attribute
        of the features; extensions are not considered at all. This could
        potentially be expanded to try to take advantage of extensions.

        Returns -1, 0 or 1. Only interpreters that still support __cmp__
        call it implicitly; the rich comparison methods below give the
        same ordering everywhere else.
        """
        assert isinstance(other, AbstractPosition), \
            "We can only do comparisons between Biopython Position objects."

        # same result as cmp(a, b), without needing the cmp() builtin
        return ((self.position > other.position)
                - (self.position < other.position))

    def __eq__(self, other):
        """Equal when both are positions with the same position value."""
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position == other.position

    def __ne__(self, other):
        """Inverse of __eq__."""
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position != other.position

    def __lt__(self, other):
        """Order by the position attribute."""
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position < other.position

    def __le__(self, other):
        """Order by the position attribute."""
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position <= other.position

    def __gt__(self, other):
        """Order by the position attribute."""
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position > other.position

    def __ge__(self, other):
        """Order by the position attribute."""
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position >= other.position

    def __hash__(self):
        """Hash on position so equal positions hash alike."""
        return hash(self.position)
245
class ExactPosition(AbstractPosition):
    """Specify the specific position of a boundary.

    o position - The position of the boundary.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    In this case, there is no fuzziness associated with the position.
    """
    def __init__(self, position, extension=0):
        """Store position with a zero extension.

        Raises AttributeError when a non-zero extension is supplied.
        """
        if extension == 0:
            AbstractPosition.__init__(self, position, 0)
        else:
            raise AttributeError("Non-zero extension %s for exact position."
                                 % extension)

    def __str__(self):
        """Return the position converted with str()."""
        text = str(self.position)
        return text
264
class WithinPosition(AbstractPosition):
    """Specify the position of a boundary within some coordinates.

    Arguments:
    o position - The start position of the boundary
    o extension - The range to which the boundary can extend.

    This allows dealing with a position like ((1.4)..100). This
    indicates that the start of the sequence is somewhere between 1
    and 4. To represent that with this class we would set position as
    1 and extension as 3.
    """
    def __init__(self, position, extension=0):
        """Store the start of the range and how far it extends."""
        AbstractPosition.__init__(self, position, extension)

    def __str__(self):
        """Return the range as "(low.high)", with high = position + extension."""
        low = self.position
        high = self.position + self.extension
        return "(%s.%s)" % (low, high)
282
class BetweenPosition(AbstractPosition):
    """Specify the position of a boundary between two coordinates.

    Arguments:
    o position - The start position of the boundary.
    o extension - The range to the other position of a boundary.

    This specifies a coordinate which is found between the two positions.
    So this allows us to deal with a position like ((1^2)..100). To
    represent that with this class we set position as 1 and the
    extension as 1.
    """
    def __init__(self, position, extension=0):
        """Store the first coordinate and the distance to the second."""
        AbstractPosition.__init__(self, position, extension)

    def __str__(self):
        """Return the pair as "(low^high)", with high = position + extension."""
        low = self.position
        high = self.position + self.extension
        return "(%s^%s)" % (low, high)
300
class BeforePosition(AbstractPosition):
    """Specify a position where the actual location occurs before it.

    Arguments:
    o position - The upper boundary of where the location can occur.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    This is used to specify positions like (<10..100) where the location
    occurs somewhere before position 10.
    """
    def __init__(self, position, extension=0):
        """Store position with a zero extension.

        Raises AttributeError when a non-zero extension is supplied.
        """
        if extension != 0:
            # the message names this class; it used to say "exact position"
            raise AttributeError("Non-zero extension %s for before position."
                                 % extension)
        AbstractPosition.__init__(self, position, 0)

    def __str__(self):
        """Return the position prefixed with "<"."""
        return "<%s" % self.position
321
class AfterPosition(AbstractPosition):
    """Specify a position where the actual location is found after it.

    Arguments:
    o position - The lower boundary of where the location can occur.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    This is used to specify positions like (>10..100) where the location
    occurs somewhere after position 10.
    """
    def __init__(self, position, extension=0):
        """Store position with a zero extension.

        Raises AttributeError when a non-zero extension is supplied.
        """
        if extension != 0:
            # the message names this class; it used to say "exact position"
            raise AttributeError("Non-zero extension %s for after position."
                                 % extension)
        AbstractPosition.__init__(self, position, 0)

    def __str__(self):
        """Return the position prefixed with ">"."""
        return ">%s" % self.position
342
class OneOfPosition(AbstractPosition):
    """Specify a position where the location can be multiple positions.

    This models the GenBank 'one-of(1888,1901)' function, and tries
    to make this fit within the Biopython Position models. In our case
    the position of the "one-of" is set as the lowest choice, and the
    extension is the range to the highest choice.

    Attributes (in addition to position and extension):
    o position_choices - the position_list passed to the initializer.
    """
    def __init__(self, position_list):
        """Initialize with a set of possible positions.

        position_list is a list of AbstractPosition derived objects,
        specifying possible locations.

        Raises ValueError when position_list is empty, since there is then
        no lowest or highest choice to derive position and extension from.
        """
        # unique attribute for this type of positions
        self.position_choices = position_list
        # collect the coordinate of every choice in a single pass
        coordinates = []
        for position_choice in self.position_choices:
            assert isinstance(position_choice, AbstractPosition), \
                "Expected position objects, got %r" % position_choice
            coordinates.append(position_choice.position)
        if not coordinates:
            raise ValueError("OneOfPosition needs at least one position "
                             "choice.")
        smallest = min(coordinates)
        largest = max(coordinates)
        # initialize with our definition of position and extension
        AbstractPosition.__init__(self, smallest, largest - smallest)

    def __str__(self):
        """Return the choices as "one-of(a,b,...)".

        Joining the pieces keeps the parentheses balanced even when
        position_choices holds no entries.
        """
        choices = ["%s" % position for position in self.position_choices]
        return "one-of(%s)" % ",".join(choices)
382
class PositionGap:
    """Simple class to hold information about a gap between positions.

    Attributes:
    o gap_size - the value passed to the initializer, stored unchanged.
    """
    def __init__(self, gap_size):
        """Initialize with the gap information.

        gap_size is stored as given. NOTE(review): the earlier docstring
        described it as a position object -- confirm what callers pass.
        """
        self.gap_size = gap_size

    def __str__(self):
        """Return the gap as "gap(<gap_size>)"."""
        return "gap(%s)" % self.gap_size
394