Package Bio :: Module SeqFeature
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqFeature

  1  """Represent a Sequence Feature holding info about a part of a sequence. 
  2   
  3  This is heavily modeled after the Biocorba SeqFeature objects, and 
  4  may be pretty biased towards GenBank stuff since I'm writing it 
  5  for the GenBank parser output... 
  6   
  7  What's here: 
  8   
  9  Base class to hold a Feature. 
 10  ---------------------------- 
 11  classes: 
 12  o SeqFeature 
 13   
 14  Hold information about a Reference. 
 15  ---------------------------------- 
 16   
 17  This is an attempt to create a General class to hold Reference type 
 18  information. 
 19   
 20  classes: 
 21  o Reference 
 22   
 23  Specify locations of a feature on a Sequence. 
 24  --------------------------------------------- 
 25   
 26  This aims to handle, in Ewan's words, 'the dreaded fuzziness issue' in 
 27  much the same way as Biocorba. This has the advantages of allowing us 
 28  to handle fuzzy stuff in case anyone needs it, and also be compatible 
 29  with Biocorba. 
 30   
 31  classes: 
 32  o FeatureLocation - Specify the start and end location of a feature. 
 33   
 34  o ExactPosition - Specify the position as being exact. 
 35  o WithinPosition - Specify a position occurring within some range. 
 36  o BetweenPosition - Specify a position occurring between a range. 
 37  o BeforePosition - Specify the position as being found before some base. 
 38  o AfterPosition - Specify the position as being found after some base. 
 39  """ 
 40   
class SeqFeature(object):
    """Represent a Sequence Feature on an object.

    Attributes:
    o location - the location of the feature on the sequence
    o type - the specified type of the feature (ie. CDS, exon, repeat...)
    o location_operator - a string specifying how this SeqFeature may
    be related to others. For example, in the example GenBank feature
    shown below, the location_operator would be "join"
    o strand - A value specifying on which strand (of a DNA sequence, for
    instance) the feature deals with. 1 indicates the plus strand, -1
    indicates the minus strand, 0 indicates both strands, and None indicates
    that strand doesn't apply (ie. for proteins) or is not known.
    o id - A string identifier for the feature.
    o ref - A reference to another sequence. This could be an accession
    number for some different sequence.
    o ref_db - A different database for the reference accession number.
    o qualifiers - A dictionary of qualifiers on the feature. These are
    analogous to the qualifiers from a GenBank feature table. The keys of
    the dictionary are qualifier names, the values are the qualifier
    values.
    o sub_features - Additional SeqFeatures which fall under this 'parent'
    feature. For instance, if we have something like:

    CDS    join(1..10,30..40,50..60)

    Then the top level feature would be a CDS from 1 to 60, and the sub
    features would be of 'CDS_join' type and would be from 1 to 10, 30 to
    40 and 50 to 60, respectively.
    """
    def __init__(self, location=None, type='', location_operator='',
                 strand=None, id="<unknown id>",
                 qualifiers=None, sub_features=None,
                 ref=None, ref_db=None):
        """Initialize a SeqFeature on a Sequence.

        Every argument is stored on the attribute of the same name.  When
        qualifiers or sub_features is None a fresh empty dict / list is
        created, so separate features never share one default container.
        """
        self.location = location

        self.type = type
        self.location_operator = location_operator
        self.strand = strand
        self.id = id
        if qualifiers is None:
            qualifiers = {}
        self.qualifiers = qualifiers
        if sub_features is None:
            sub_features = []
        self.sub_features = sub_features
        self.ref = ref
        self.ref_db = ref_db

    def __repr__(self):
        """A string representation of the record for debugging.

        The location is always shown; the remaining attributes are only
        listed when they are truthy (and, for id, not the default
        placeholder).
        """
        parts = [repr(self.location)]
        if self.type:
            parts.append("type=%s" % repr(self.type))
        if self.location_operator:
            parts.append("location_operator=%s"
                         % repr(self.location_operator))
        # NOTE(review): strand 0 ("both strands") is falsy, so it is left
        # out here exactly like None - confirm that this is intended.
        if self.strand:
            parts.append("strand=%s" % repr(self.strand))
        if self.id and self.id != "<unknown id>":
            parts.append("id=%s" % repr(self.id))
        if self.ref:
            parts.append("ref=%s" % repr(self.ref))
        if self.ref_db:
            parts.append("ref_db=%s" % repr(self.ref_db))
        return "%s(%s)" % (self.__class__.__name__, ", ".join(parts))

    def __str__(self):
        """A readable summary of the feature intended to be printed to screen.

        Qualifiers are listed in sorted key order, followed by the string
        form of each sub-feature (if there are any).
        """
        lines = ["type: %s" % self.type,
                 "location: %s" % self.location,
                 "ref: %s:%s" % (self.ref, self.ref_db),
                 "strand: %s" % self.strand,
                 "qualifiers: "]
        # sorted() works on Python 2 lists and Python 3 key views alike,
        # whereas calling .sort() on the result of keys() does not.
        for qual_key in sorted(self.qualifiers):
            lines.append(" Key: %s, Value: %s"
                         % (qual_key, self.qualifiers[qual_key]))
        if self.sub_features:
            lines.append("Sub-Features")
            for sub_feature in self.sub_features:
                lines.append("%s" % sub_feature)
        # Every entry, including the last one, is newline terminated.
        return "\n".join(lines) + "\n"

    def _shift(self, offset):
        """Returns a copy of the feature with its location shifted (PRIVATE).

        The annotation qualifiers are copied (shallow copy of the dict) and
        every sub-feature is shifted by the same offset.
        """
        return SeqFeature(location=self.location._shift(offset),
                          type=self.type,
                          location_operator=self.location_operator,
                          strand=self.strand,
                          id=self.id,
                          # dict(mapping) copies on Python 2 and 3;
                          # iteritems() only exists on Python 2.
                          qualifiers=dict(self.qualifiers),
                          sub_features=[f._shift(offset)
                                        for f in self.sub_features],
                          ref=self.ref,
                          ref_db=self.ref_db)
147 148 # --- References 149 150 # TODO -- Will this hold PubMed and Medline information decently?
class Reference(object):
    """Represent a Generic Reference object.

    Attributes:
    o location - A list of Location objects specifying regions of
    the sequence that the references correspond to. If no locations are
    specified, the entire sequence is assumed.
    o authors - A big old string, or a list split by author, of authors
    for the reference.
    o consrtm - Initialised to an empty string; only included in the
    string output when it is non-empty.
    o title - The title of the reference.
    o journal - Journal the reference was published in.
    o medline_id - A medline reference for the article.
    o pubmed_id - A pubmed reference for the article.
    o comment - A place to stick any comments about the reference.
    """
    def __init__(self):
        """Create an empty reference: no locations, all text fields ''."""
        self.location = []
        # Strings are immutable, so sharing one '' between fields is safe.
        self.authors = self.consrtm = self.title = self.journal = ''
        self.medline_id = self.pubmed_id = self.comment = ''

    def __str__(self):
        """Output an informative string for debugging.

        One "name: value" line per field, preceded by one line for each
        entry in the location list.
        """
        pieces = ["location: %s\n" % place for place in self.location]
        pieces.append("authors: %s\n" % self.authors)
        if self.consrtm:
            pieces.append("consrtm: %s\n" % self.consrtm)
        pieces.append("title: %s\n" % self.title)
        pieces.append("journal: %s\n" % self.journal)
        pieces.append("medline id: %s\n" % self.medline_id)
        pieces.append("pubmed id: %s\n" % self.pubmed_id)
        pieces.append("comment: %s\n" % self.comment)
        return "".join(pieces)

    def __repr__(self):
        """Short debugging representation showing only the title."""
        # TODO - Update this if __init__ later accepts values
        return "%s(title=%r, ...)" % (type(self).__name__, self.title)
196 197 # --- Handling feature locations 198
class FeatureLocation(object):
    """Specify the location of a feature along a sequence.

    This attempts to deal with fuzziness of position ends, but also
    make it easy to get the start and end in the 'normal' case (no
    fuzziness).

    You should access the start and end attributes with
    your_location.start and your_location.end. If the start and
    end are exact, this will return the positions, if not, we'll return
    the appropriate Fuzzy class with info about the position and fuzziness.

    Note that the start and end location numbering follow Python's scheme,
    thus a GenBank entry of 123..150 (one based counting) becomes a location
    of [122:150] (zero based counting).
    """
    def __init__(self, start, end):
        """Specify the start and end of a sequence feature.

        start and end arguments specify the values where the feature begins
        and ends. These can either be any of the *Position objects that
        inherit from AbstractPosition, or can just be integers specifying the
        position. In the case of integers, the values are assumed to be
        exact and are converted into ExactPosition arguments. This is meant
        to make it easy to deal with non-fuzzy ends.
        """
        # Anything that is not already a position object gets wrapped.
        if not isinstance(start, AbstractPosition):
            start = ExactPosition(start)
        if not isinstance(end, AbstractPosition):
            end = ExactPosition(end)
        self._start = start
        self._end = end

    def __str__(self):
        """Returns a representation of the location (with python counting).

        For the simple case this uses the python splicing syntax, [122:150]
        (zero based counting) which GenBank would call 123..150 (one based
        counting).
        """
        begin, finish = self._start, self._end
        return "[%s:%s]" % (begin, finish)

    def __repr__(self):
        """A string representation of the location for debugging."""
        return "%s(%r,%r)" % (self.__class__.__name__, self.start, self.end)

    def _shift(self, offset):
        """Returns a copy of the location shifted by the offset (PRIVATE)."""
        moved_start = self._start._shift(offset)
        moved_end = self._end._shift(offset)
        return FeatureLocation(start=moved_start, end=moved_end)

    @property
    def start(self):
        """Start location (possibly a fuzzy position, read only)."""
        return self._start

    @property
    def end(self):
        """End location (possibly a fuzzy position, read only)."""
        return self._end

    def _get_nofuzzy_start(self):
        # TODO - Do we still use the BetweenPosition class?
        begin = self._start
        if begin == self._end and isinstance(begin, BetweenPosition):
            return begin.position
        # Smallest coordinate the start position could refer to.
        return min(begin.position, begin.position + begin.extension)

    nofuzzy_start = property(fget=_get_nofuzzy_start,
        doc="""Start position (integer, approximated if fuzzy, read only).

        To get non-fuzzy attributes (ie. the position only) ask for
        'location.nofuzzy_start', 'location.nofuzzy_end'. These should return
        the largest range of the fuzzy position. So something like:
        (10.20)..(30.40) should return 10 for start, and 40 for end.
        """)

    def _get_nofuzzy_end(self):
        # TODO - Do we still use the BetweenPosition class?
        finish = self._end
        if self._start == finish and isinstance(self._start, BetweenPosition):
            return finish.position
        # Largest coordinate the end position could refer to.
        return max(finish.position, finish.position + finish.extension)

    nofuzzy_end = property(fget=_get_nofuzzy_end,
        doc="""End position (integer, approximated if fuzzy, read only).

        To get non-fuzzy attributes (ie. the position only) ask for
        'location.nofuzzy_start', 'location.nofuzzy_end'. These should return
        the largest range of the fuzzy position. So something like:
        (10.20)..(30.40) should return 10 for start, and 40 for end.
        """)
293
class AbstractPosition(object):
    """Abstract base class representing a position.

    Attributes:
    o position - The value passed as position.
    o extension - The value passed as extension.

    Positions compare (and hash) by their position attribute only;
    extensions are not considered at all.
    """
    def __init__(self, position, extension):
        """Store position and extension unchanged."""
        self.position = position
        self.extension = extension

    def __repr__(self):
        """String representation of the location for debugging."""
        return "%s(%s,%s)" % (self.__class__.__name__,
                              repr(self.position), repr(self.extension))

    # Python 3 ignores __cmp__ and has no cmp() builtin, so the ordering is
    # also provided through the rich comparison methods.  Each returns
    # NotImplemented for operands that are not position objects.

    def __eq__(self, other):
        """Equal when both operands hold the same position value."""
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position == other.position

    def __ne__(self, other):
        """Unequal when the position values differ."""
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position != other.position

    def __lt__(self, other):
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position < other.position

    def __le__(self, other):
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position <= other.position

    def __gt__(self, other):
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position > other.position

    def __ge__(self, other):
        if not isinstance(other, AbstractPosition):
            return NotImplemented
        return self.position >= other.position

    def __hash__(self):
        """Hash on the position so that equal positions hash equally."""
        # Defining __eq__ would otherwise make instances unhashable on
        # Python 3.
        return hash(self.position)

    def __cmp__(self, other):
        """A simple comparison function for positions.

        This is very simple-minded and just compares the position attribute
        of the features; extensions are not considered at all. This could
        potentially be expanded to try to take advantage of extensions.

        Returns -1, 0 or 1.  Kept for code that calls it directly.
        """
        assert isinstance(other, AbstractPosition), \
            "We can only do comparisons between Biopython Position objects."

        # Same result as cmp(a, b), without relying on the cmp() builtin.
        return ((self.position > other.position)
                - (self.position < other.position))

    def _shift(self, offset):
        """Return a copy moved by offset, keeping the extension (PRIVATE)."""
        # We want this to maintain the subclass when called from a subclass
        return self.__class__(self.position + offset, self.extension)
321
class ExactPosition(AbstractPosition):
    """Specify the specific position of a boundary.

    o position - The position of the boundary.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    In this case, there is no fuzziness associated with the position.
    """
    def __init__(self, position, extension=0):
        """Store position with a zero extension.

        Raises AttributeError if a non-zero extension is given.
        """
        if extension != 0:
            message = "Non-zero extension %s for exact position." % extension
            raise AttributeError(message)
        super(ExactPosition, self).__init__(position, 0)

    def __repr__(self):
        """String representation of the ExactPosition location for debugging."""
        assert self.extension == 0
        return "%s(%r)" % (type(self).__name__, self.position)

    def __str__(self):
        """Return the position converted with str()."""
        return str(self.position)
345
class WithinPosition(AbstractPosition):
    """Specify the position of a boundary within some coordinates.

    Arguments:
    o position - The start position of the boundary
    o extension - The range to which the boundary can extend.

    This allows dealing with a position like ((1.4)..100). This
    indicates that the start of the sequence is somewhere between 1
    and 4. To represent that with this class we would set position as
    1 and extension as 3.
    """
    def __init__(self, position, extension=0):
        """Store position and extension (extension defaults to 0)."""
        super(WithinPosition, self).__init__(position, extension)

    def __str__(self):
        """Return the range formatted as "(low.high)"."""
        low = self.position
        high = self.position + self.extension
        return "(%s.%s)" % (low, high)
363
class BetweenPosition(AbstractPosition):
    """Specify the position of a boundary between two coordinates.

    Arguments:
    o position - The start position of the boundary.
    o extension - The range to the other position of a boundary.

    This specifies a coordinate which is found between the two positions.
    So this allows us to deal with a position like ((1^2)..100). To
    represent that with this class we set position as 1 and the
    extension as 1.
    """
    def __init__(self, position, extension=0):
        """Store position and extension (extension defaults to 0)."""
        super(BetweenPosition, self).__init__(position, extension)

    def __str__(self):
        """Return the two coordinates formatted as "(low^high)"."""
        low = self.position
        high = self.position + self.extension
        return "(%s^%s)" % (low, high)
381
class BeforePosition(AbstractPosition):
    """Specify a position where the actual location occurs before it.

    Arguments:
    o position - The upper boundary of where the location can occur.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    This is used to specify positions like (<10..100) where the location
    occurs somewhere before position 10.
    """
    def __init__(self, position, extension=0):
        """Store position with a zero extension.

        Raises AttributeError if a non-zero extension is given.
        """
        if extension != 0:
            # The message names this position type.
            raise AttributeError("Non-zero extension %s for before position."
                                 % extension)
        AbstractPosition.__init__(self, position, 0)

    def __repr__(self):
        """A string representation of the location for debugging."""
        assert self.extension == 0
        return "%s(%s)" % (self.__class__.__name__, repr(self.position))

    def __str__(self):
        """Return the position prefixed with "<"."""
        return "<%s" % self.position
407
class AfterPosition(AbstractPosition):
    """Specify a position where the actual location is found after it.

    Arguments:
    o position - The lower boundary of where the location can occur.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    This is used to specify positions like (>10..100) where the location
    occurs somewhere after position 10.
    """
    def __init__(self, position, extension=0):
        """Store position with a zero extension.

        Raises AttributeError if a non-zero extension is given.
        """
        if extension != 0:
            # The message names this position type.
            raise AttributeError("Non-zero extension %s for after position."
                                 % extension)
        AbstractPosition.__init__(self, position, 0)

    def __repr__(self):
        """A string representation of the location for debugging."""
        assert self.extension == 0
        return "%s(%s)" % (self.__class__.__name__, repr(self.position))

    def __str__(self):
        """Return the position prefixed with ">"."""
        return ">%s" % self.position
433
class OneOfPosition(AbstractPosition):
    """Specify a position where the location can be multiple positions.

    This models the GenBank 'one-of(1888,1901)' function, and tries
    to make this fit within the Biopython Position models. In our case
    the position of the "one-of" is set as the lowest choice, and the
    extension is the range to the highest choice.
    """
    def __init__(self, position_list):
        """Initialize with a set of possible positions.

        position_list is a list of AbstractPosition derived objects,
        specifying possible locations.

        Raises ValueError if position_list holds no positions.
        """
        # unique attribute for this type of positions
        self.position_choices = position_list
        coordinates = []
        for position_choice in self.position_choices:
            assert isinstance(position_choice, AbstractPosition), \
                "Expected position objects, got %r" % position_choice
            coordinates.append(position_choice.position)
        if not coordinates:
            # Without any choice there is no smallest / largest coordinate.
            raise ValueError("OneOfPosition needs at least one position "
                             "choice.")
        # initialize with our definition of position and extension
        smallest = min(coordinates)
        AbstractPosition.__init__(self, smallest,
                                  max(coordinates) - smallest)

    def __repr__(self):
        """String representation of the OneOfPosition location for debugging."""
        return "%s(%s)" % (self.__class__.__name__,
                           repr(self.position_choices))

    def __str__(self):
        """Return the choices formatted as "one-of(a,b,...)"."""
        choices = ",".join(["%s" % position
                            for position in self.position_choices])
        return "one-of(%s)" % choices

    def _shift(self, offset):
        """Return a copy with every choice shifted by offset (PRIVATE)."""
        # The inherited _shift calls the constructor with (position,
        # extension), which does not match this class's single
        # position_list argument, so the copy is rebuilt from shifted
        # choices instead.
        return self.__class__([position_choice._shift(offset)
                               for position_choice in self.position_choices])
478
class PositionGap(object):
    """Simple class to hold information about a gap between positions.

    Attributes:
    o gap_size - The value given to the constructor describing the gap.
    """
    def __init__(self, gap_size):
        """Initialize with the object containing the gap information."""
        self.gap_size = gap_size

    def __repr__(self):
        """A string representation of the position gap for debugging."""
        return "%s(%r)" % (type(self).__name__, self.gap_size)

    def __str__(self):
        """Return the gap formatted as "gap(size)"."""
        return "gap(%s)" % self.gap_size
494