Package Bio :: Module SeqFeature
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqFeature

  1  """Represent a Sequence Feature holding info about a part of a sequence. 
  2   
  3  This is heavily modeled after the Biocorba SeqFeature objects, and 
  4  may be pretty biased towards GenBank stuff since I'm writing it 
  5  for the GenBank parser output... 
  6   
  7  What's here: 
  8   
  9  Base class to hold a Feature. 
 10  ---------------------------- 
 11  classes: 
 12  o SeqFeature 
 13   
 14  Hold information about a Reference. 
 15  ---------------------------------- 
 16   
 17  This is an attempt to create a General class to hold Reference type 
 18  information. 
 19   
 20  classes: 
 21  o Reference 
 22   
 23  Specify locations of a feature on a Sequence. 
 24  --------------------------------------------- 
 25   
 26  This aims to handle, in Ewan's words, 'the dreaded fuzziness issue' in 
 27  much the same way as Biocorba. This has the advantages of allowing us 
 28  to handle fuzzy stuff in case anyone needs it, and also be compatible 
 29  with Biocorba. 
 30   
 31  classes: 
 32  o FeatureLocation - Specify the start and end location of a feature. 
 33   
 34  o ExactPosition - Specify the position as being exact. 
 35  o WithinPosition - Specify a position occurring within some range. 
 36  o BetweenPosition - Specify a position occurring between a range. 
 37  o BeforePosition - Specify the position as being found before some base. 
 38  o AfterPosition - Specify the position as being found after some base. 
 39  """ 
 40   
class SeqFeature:
    """Represent a Sequence Feature on an object.

    Attributes:
    o location - the location of the feature on the sequence
    o type - the specified type of the feature (ie. CDS, exon, repeat...)
    o location_operator - a string specifying how this SeqFeature may
    be related to others. For example, in the example GenBank feature
    shown below, the location_operator would be "join"
    o strand - A value specifying on which strand (of a DNA sequence, for
    instance) the feature deals with. 1 indicates the plus strand, -1
    indicates the minus strand, 0 indicates both strands, and None indicates
    that strand doesn't apply (ie. for proteins) or is not known.
    o id - A string identifier for the feature.
    o ref - A reference to another sequence. This could be an accession
    number for some different sequence.
    o ref_db - A different database for the reference accession number.
    o qualifiers - A dictionary of qualifiers on the feature. These are
    analogous to the qualifiers from a GenBank feature table. The keys of
    the dictionary are qualifier names, the values are the qualifier
    values.
    o sub_features - Additional SeqFeatures which fall under this 'parent'
    feature. For instance, if we have something like:

    CDS    join(1..10,30..40,50..60)

    Then the top level feature would be a CDS from 1 to 60, and the sub
    features would be of 'CDS_join' type and would be from 1 to 10, 30 to
    40 and 50 to 60, respectively.
    """
    def __init__(self, location = None, type = '', location_operator = '',
                 strand = None, id = "<unknown id>",
                 qualifiers = None, sub_features = None,
                 ref = None, ref_db = None):
        """Initialize a SeqFeature on a Sequence.

        All arguments are optional and are stored on the attribute of the
        same name. qualifiers and sub_features default to a fresh empty
        dictionary and a fresh empty list for every instance; when given,
        the objects passed in are stored as they are (not copied).
        """
        self.location = location

        self.type = type
        self.location_operator = location_operator
        self.strand = strand
        self.id = id
        # None is used as the "not given" marker so that no mutable default
        # object is ever shared between instances.
        if qualifiers is None:
            qualifiers = {}
        self.qualifiers = qualifiers
        if sub_features is None:
            sub_features = []
        self.sub_features = sub_features
        self.ref = ref
        self.ref_db = ref_db

    def __repr__(self):
        """A string representation of the record for debugging."""
        answer = "%s(%s" % (self.__class__, repr(self.location))
        # Only attributes holding a true value are shown.
        if self.type:
            answer += ", type=%s" % repr(self.type)
        if self.location_operator:
            answer += ", location_operator=%s" % repr(self.location_operator)
        if self.strand:
            answer += ", strand=%s" % repr(self.strand)
        if self.id and self.id != "<unknown id>":
            answer += ", id=%s" % repr(self.id)
        if self.ref:
            answer += ", ref=%s" % repr(self.ref)
        if self.ref_db:
            answer += ", ref_db=%s" % repr(self.ref_db)
        answer += ")"
        return answer

    def __str__(self):
        """A readable summary of the feature intended to be printed to screen.

        Qualifiers are listed in sorted key order, followed by the string
        form of each sub-feature (if there are any).
        """
        parts = ["type: %s\n" % self.type,
                 "location: %s\n" % self.location,
                 "ref: %s:%s\n" % (self.ref, self.ref_db),
                 "strand: %s\n" % self.strand,
                 "qualifiers: \n"]
        # sorted() works whether keys() returns a list or a view.
        for qual_key in sorted(self.qualifiers.keys()):
            parts.append("\tKey: %s, Value: %s\n"
                         % (qual_key, self.qualifiers[qual_key]))
        if len(self.sub_features) != 0:
            parts.append("Sub-Features\n")
            for sub_feature in self.sub_features:
                parts.append("%s\n" % sub_feature)
        return "".join(parts)

    def _shift(self, offset):
        """Returns a copy of the feature with its location shifted (PRIVATE).

        The location and every sub-feature are shifted by calling their own
        _shift(offset). The annotation qualifiers are copied (a shallow
        copy of the dictionary).
        """
        return SeqFeature(
            location = self.location._shift(offset),
            type = self.type,
            location_operator = self.location_operator,
            strand = self.strand,
            id = self.id,
            qualifiers = dict(self.qualifiers),
            sub_features = [f._shift(offset) for f in self.sub_features],
            ref = self.ref,
            ref_db = self.ref_db)
147 148 # --- References 149 150 # TODO -- Will this hold PubMed and Medline information decently?
class Reference:
    """Represent a Generic Reference object.

    Attributes:
    o location - A list of Location objects specifying regions of
    the sequence that the references correspond to. If no locations are
    specified, the entire sequence is assumed.
    o authors - A big old string, or a list split by author, of authors
    for the reference.
    o consrtm - A string, empty by default; only shown by str() when set.
    o title - The title of the reference.
    o journal - Journal the reference was published in.
    o medline_id - A medline reference for the article.
    o pubmed_id - A pubmed reference for the article.
    o comment - A place to stick any comments about the reference.
    """
    def __init__(self):
        """Create an empty reference: no locations, all text fields ''."""
        self.location = []
        self.authors = self.consrtm = self.title = self.journal = ''
        self.medline_id = self.pubmed_id = self.comment = ''

    def __str__(self):
        """Output an informative string for debugging.

        One line per location comes first, then one line per text field;
        the consrtm line is left out while that field is empty.
        """
        pieces = []
        for single_location in self.location:
            pieces.append("location: %s\n" % single_location)
        pieces.append("authors: %s\n" % self.authors)
        if self.consrtm:
            pieces.append("consrtm: %s\n" % self.consrtm)
        pieces.extend(["title: %s\n" % self.title,
                       "journal: %s\n" % self.journal,
                       "medline id: %s\n" % self.medline_id,
                       "pubmed id: %s\n" % self.pubmed_id,
                       "comment: %s\n" % self.comment])
        return "".join(pieces)
192 193 # --- Handling feature locations 194
class FeatureLocation:
    """Specify the location of a feature along a sequence.

    This attempts to deal with fuzziness of position ends, but also
    make it easy to get the start and end in the 'normal' case (no
    fuzziness).

    You should access the start and end attributes with
    your_location.start and your_location.end. If the start and
    end are exact, this will return the positions, if not, we'll return
    the appropriate Fuzzy class with info about the position and fuzziness.

    Note that the start and end location numbering follow Python's scheme,
    thus a GenBank entry of 123..150 (one based counting) becomes a location
    of [122:150] (zero based counting).
    """
    def __init__(self, start, end):
        """Specify the start and end of a sequence feature.

        start and end arguments specify the values where the feature begins
        and ends. These can either be any of the *Position objects that
        inherit from AbstractPosition, or can just be integers specifying the
        position. In the case of integers, the values are assumed to be
        exact and are converted into ExactPosition objects. This is meant
        to make it easy to deal with non-fuzzy ends.
        """
        if isinstance(start, AbstractPosition):
            self._start = start
        else:
            self._start = ExactPosition(start)

        if isinstance(end, AbstractPosition):
            self._end = end
        else:
            self._end = ExactPosition(end)

    def __str__(self):
        """Returns a representation of the location (with python counting).

        For the simple case this uses the python splicing syntax, [122:150]
        (zero based counting) which GenBank would call 123..150 (one based
        counting).
        """
        return "[%s:%s]" % (self._start, self._end)

    def __repr__(self):
        """A string representation of the location for debugging."""
        return "%s(%s,%s)" \
               % (self.__class__, repr(self.start), repr(self.end))

    def _shift(self, offset):
        """Returns a copy of the location shifted by the offset (PRIVATE)."""
        return FeatureLocation(start = self._start._shift(offset),
                               end = self._end._shift(offset))

    def _is_single_between(self):
        """True if start and end are one between position, eg 2^3 (PRIVATE).

        The positions are compared directly through their position
        attributes instead of with ==, so the outcome does not depend on
        which comparison methods the position classes happen to provide.
        """
        return (self._start.position == self._end.position
                and isinstance(self._start, BetweenPosition))

    def __getattr__(self, attr):
        """Make it easy to get non-fuzzy starts and ends.

        We override get_attribute here so that in non-fuzzy cases we
        can just return the start and end position without any hassle.

        To get fuzzy start and ends, just ask for item.start and
        item.end. To get non-fuzzy attributes (ie. the position only)
        ask for 'item.nofuzzy_start', 'item.nofuzzy_end'. These should return
        the largest range of the fuzzy position. So something like:
        (10.20)..(30.40) should return 10 for start, and 40 for end.

        The special tricky case is when we have a single between position
        argument like 2^3 for the range. We want nofuzzy_start and nofuzzy_end
        to give a reasonable approximation of what this really means, which
        is an empty string -- so the same position for both. Doing a special
        case here sucks, but there is really not a general rule you can apply
        to this.

        Any other attribute name raises AttributeError.
        """
        #TODO - these are not currently implemented as properties, this means
        #they do not show up via dir(...)
        if attr == 'start':
            return self._start
        elif attr == 'end':
            return self._end
        elif attr == 'nofuzzy_start':
            if self._is_single_between():
                return self._start.position
            else:
                # the smaller of the two ends of the start position's range
                return min(self._start.position,
                           self._start.position + self._start.extension)
        elif attr == 'nofuzzy_end':
            if self._is_single_between():
                return self._end.position
            else:
                # the larger of the two ends of the end position's range
                return max(self._end.position,
                           self._end.position + self._end.extension)
        else:
            raise AttributeError("Cannot evaluate attribute %s." % attr)
291
class AbstractPosition:
    """Abstract base class representing a position.

    Attributes:
    o position - The position value itself.
    o extension - The extension (range) stored alongside the position.

    Positions compare and sort by their position attribute only. Both
    __cmp__ and the rich comparison methods are provided, so ==, !=, <,
    <=, > and >= give the same answers whether or not the running Python
    still uses __cmp__.
    """
    def __init__(self, position, extension):
        """Store the given position and extension."""
        self.position = position
        self.extension = extension

    def __repr__(self):
        """String representation of the location for debugging."""
        return "%s(%s,%s)" \
               % (self.__class__, repr(self.position), repr(self.extension))

    def __cmp__(self, other):
        """A simple comparison function for positions.

        This is very simple-minded and just compares the position attribute
        of the features; extensions are not considered at all. This could
        potentially be expanded to try to take advantage of extensions.

        Returns -1, 0 or 1. Comparing against something that is not a
        position object fails the assertion below.
        """
        assert isinstance(other, AbstractPosition), \
               "We can only do comparisons between Biopython Position objects."

        # (a > b) - (a < b) gives the same -1/0/1 answer as cmp(a, b)
        # without needing the cmp builtin.
        return ((self.position > other.position)
                - (self.position < other.position))

    # The rich comparison methods all defer to __cmp__ so that there is a
    # single definition of how positions are ordered.
    def __eq__(self, other):
        """True if both positions have equal position attributes."""
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        """True if the position attributes differ."""
        return self.__cmp__(other) != 0

    def __lt__(self, other):
        """True if this position attribute is smaller than the other."""
        return self.__cmp__(other) < 0

    def __le__(self, other):
        """True if this position attribute is smaller or equal."""
        return self.__cmp__(other) <= 0

    def __gt__(self, other):
        """True if this position attribute is larger than the other."""
        return self.__cmp__(other) > 0

    def __ge__(self, other):
        """True if this position attribute is larger or equal."""
        return self.__cmp__(other) >= 0

    def _shift(self, offset):
        """Returns a copy of the position moved by offset (PRIVATE).

        The extension is carried over unchanged.
        """
        #We want this to maintain the subclass when called from a subclass
        return self.__class__(self.position + offset, self.extension)
319
class ExactPosition(AbstractPosition):
    """Specify the specific position of a boundary.

    o position - The position of the boundary.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    In this case, there is no fuzziness associated with the position.
    """
    def __init__(self, position, extension = 0):
        """Store the position; any non-zero extension raises AttributeError."""
        if extension != 0:
            message = "Non-zero extension %s for exact position." % extension
            raise AttributeError(message)
        # An exact position always carries an extension of zero.
        AbstractPosition.__init__(self, position, 0)

    def __repr__(self):
        """String representation of the ExactPosition location for debugging."""
        assert self.extension == 0
        position_text = repr(self.position)
        return "%s(%s)" % (self.__class__, position_text)

    def __str__(self):
        """The position on its own, as a string."""
        text = str(self.position)
        return text
343
class WithinPosition(AbstractPosition):
    """Specify the position of a boundary within some coordinates.

    Arguments:
    o position - The start position of the boundary
    o extension - The range to which the boundary can extend.

    This allows dealing with a position like ((1.4)..100). This
    indicates that the start of the sequence is somewhere between 1
    and 4. To represent that with this class we would set position as
    1 and extension as 3.
    """
    def __init__(self, position, extension = 0):
        """Store the position and extension unchanged."""
        AbstractPosition.__init__(self, position, extension)

    def __str__(self):
        """The range written as (low.high), high being position + extension."""
        low = self.position
        high = low + self.extension
        return "(%s.%s)" % (low, high)
361
class BetweenPosition(AbstractPosition):
    """Specify the position of a boundary between two coordinates.

    Arguments:
    o position - The start position of the boundary.
    o extension - The range to the other position of a boundary.

    This specifies a coordinate which is found between the two positions.
    So this allows us to deal with a position like ((1^2)..100). To
    represent that with this class we set position as 1 and the
    extension as 1.
    """
    def __init__(self, position, extension = 0):
        """Store the position and extension unchanged."""
        AbstractPosition.__init__(self, position, extension)

    def __str__(self):
        """The pair written as (low^high), high being position + extension."""
        low = self.position
        high = low + self.extension
        return "(%s^%s)" % (low, high)
379
class BeforePosition(AbstractPosition):
    """Specify a position where the actual location occurs before it.

    Arguments:
    o position - The upper boundary of where the location can occur.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    This is used to specify positions like (<10..100) where the location
    occurs somewhere before position 10.
    """
    def __init__(self, position, extension = 0):
        """Store the position; any non-zero extension raises AttributeError."""
        if extension != 0:
            # The message names this position type so the error points at
            # the right class.
            raise AttributeError("Non-zero extension %s for before position."
                                 % extension)
        AbstractPosition.__init__(self, position, 0)

    def __repr__(self):
        """A string representation of the location for debugging."""
        assert self.extension == 0
        return "%s(%s)" % (self.__class__, repr(self.position))

    def __str__(self):
        """The position written with a leading '<', eg '<10'."""
        return "<%s" % self.position
405
class AfterPosition(AbstractPosition):
    """Specify a position where the actual location is found after it.

    Arguments:
    o position - The lower boundary of where the location can occur.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    This is used to specify positions like (>10..100) where the location
    occurs somewhere after position 10.
    """
    def __init__(self, position, extension = 0):
        """Store the position; any non-zero extension raises AttributeError."""
        if extension != 0:
            # The message names this position type so the error points at
            # the right class.
            raise AttributeError("Non-zero extension %s for after position."
                                 % extension)
        AbstractPosition.__init__(self, position, 0)

    def __repr__(self):
        """A string representation of the location for debugging."""
        assert self.extension == 0
        return "%s(%s)" % (self.__class__, repr(self.position))

    def __str__(self):
        """The position written with a leading '>', eg '>10'."""
        return ">%s" % self.position
431
class OneOfPosition(AbstractPosition):
    """Specify a position where the location can be multiple positions.

    This models the GenBank 'one-of(1888,1901)' function, and tries
    to make this fit within the Biopython Position models. In our case
    the position of the "one-of" is set as the lowest choice, and the
    extension is the range to the highest choice.
    """
    def __init__(self, position_list):
        """Initialize with a set of possible positions.

        position_list is a list of AbstractPosition derived objects,
        specifying possible locations. It must hold at least one choice,
        otherwise ValueError is raised.
        """
        # unique attribute for this type of positions
        self.position_choices = position_list
        for position_choice in self.position_choices:
            assert isinstance(position_choice, AbstractPosition), \
                   "Expected position objects, got %r" % position_choice
        if not self.position_choices:
            # Without any choice there is no lowest or highest position
            # to derive position and extension from.
            raise ValueError("OneOfPosition needs at least one position.")
        # find the smallest and largest position in the choices
        choice_values = [choice.position for choice in self.position_choices]
        smallest = min(choice_values)
        largest = max(choice_values)
        # initialize with our definition of position and extension
        AbstractPosition.__init__(self, smallest, largest - smallest)

    def __repr__(self):
        """String representation of the OneOfPosition location for debugging."""
        return "%s(%s)" % (self.__class__, repr(self.position_choices))

    def __str__(self):
        """The choices written as one-of(a,b,...)."""
        choice_text = ",".join([str(choice)
                                for choice in self.position_choices])
        return "one-of(" + choice_text + ")"

    def _shift(self, offset):
        """Returns a copy with every choice shifted by offset (PRIVATE).

        The inherited version passes a position and an extension to the
        constructor, which this class does not accept; each choice is
        shifted on its own instead and a new object built from the result.
        """
        return self.__class__([choice._shift(offset)
                               for choice in self.position_choices])
475
class PositionGap:
    """Simple class to hold information about a gap between positions.

    Attributes:
    o gap_size - The gap information handed to the constructor.
    """
    def __init__(self, gap_size):
        """Initialize with a position object containing the gap information.
        """
        self.gap_size = gap_size

    def __repr__(self):
        """A string representation of the position gap for debugging."""
        size_text = repr(self.gap_size)
        return "%s(%s)" % (self.__class__, size_text)

    def __str__(self):
        """The gap written as gap(size)."""
        return "gap(%s)" % self.gap_size
491