1 """Represent a Sequence Feature holding info about a part of a sequence.
2
3 This is heavily modeled after the Biocorba SeqFeature objects, and
4 may be pretty biased towards GenBank stuff since I'm writing it
5 for the GenBank parser output...
6
7 What's here:
8
9 Base class to hold a Feature.
10 ----------------------------
11 classes:
12 o SeqFeature
13
14 Hold information about a Reference.
15 ----------------------------------
16
17 This is an attempt to create a General class to hold Reference type
18 information.
19
20 classes:
21 o Reference
22
23 Specify locations of a feature on a Sequence.
24 ---------------------------------------------
25
26 This aims to handle, in Ewan's words, 'the dreaded fuzziness issue' in
27 much the same way as Biocorba. This has the advantages of allowing us
28 to handle fuzzy stuff in case anyone needs it, and also be compatible
29 with Biocorba.
30
31 classes:
32 o FeatureLocation - Specify the start and end location of a feature.
33
34 o ExactPosition - Specify the position as being exact.
35 o WithinPosition - Specify a position occurring within some range.
36 o BetweenPosition - Specify a position occurring between a range.
37 o BeforePosition - Specify the position as being found before some base.
38 o AfterPosition - Specify the position as being found after some base.
39 """
40
42 """Represent a Sequence Feature on an object.
43
44 Attributes:
45 o location - the location of the feature on the sequence
46 o type - the specified type of the feature (ie. CDS, exon, repeat...)
47 o location_operator - a string specifying how this SeqFeature may
48 be related to others. For example, in the example GenBank feature
49 shown below, the location_operator would be "join"
50 o strand - A value specifying on which strand (of a DNA sequence, for
51 instance) the feature deals with. 1 indicates the plus strand, -1
52 indicates the minus strand, 0 indicates both strands, and None indicates
53 that strand doesn't apply (ie. for proteins) or is not known.
54 o id - A string identifier for the feature.
55 o ref - A reference to another sequence. This could be an accession
56 number for some different sequence.
57 o ref_db - A different database for the reference accession number.
58 o qualifiers - A dictionary of qualifiers on the feature. These are
59 analogous to the qualifiers from a GenBank feature table. The keys of
60 the dictionary are qualifier names, the values are the qualifier
61 values.
62 o sub_features - Additional SeqFeatures which fall under this 'parent'
63 feature. For instance, if we have something like:
64
65 CDS join(1..10,30..40,50..60)
66
67 Then the top level feature would be a CDS from 1 to 60, and the sub
68 features would be of 'CDS_join' type and would be from 1 to 10, 30 to
69 40 and 50 to 60, respectively.
70 """
def __init__(self, location=None, type='', location_operator='',
             strand=None, id="<unknown id>",
             qualifiers=None, sub_features=None,
             ref=None, ref_db=None):
    """Initialize a SeqFeature on a Sequence.

    Every argument is optional and is stored on the attribute of the
    same name:
    o location - the location of the feature on the sequence.
    o type - the specified type of the feature (ie. CDS, exon, repeat...).
    o location_operator - a string specifying how this SeqFeature may
    be related to others (for example "join").
    o strand - 1 for the plus strand, -1 for the minus strand, 0 for
    both strands, None if strand doesn't apply or is not known.
    o id - a string identifier for the feature.
    o qualifiers - a dictionary of qualifier names to qualifier values.
    The object passed in is stored as given (not copied); when omitted
    a new empty dictionary is created for this feature.
    o sub_features - a list of SeqFeatures falling under this feature.
    The object passed in is stored as given (not copied); when omitted
    a new empty list is created for this feature.
    o ref - a reference to another sequence.
    o ref_db - a different database for the reference accession number.
    """
    self.location = location

    self.type = type
    self.location_operator = location_operator
    self.strand = strand
    self.id = id

    # None is used as the "not supplied" marker so that each feature
    # gets its own container; a {} or [] default in the signature would
    # be a single object shared by every call.
    if qualifiers is None:
        qualifiers = {}
    if sub_features is None:
        sub_features = []
    # Previously these two arguments were accepted but silently dropped.
    self.qualifiers = qualifiers
    self.sub_features = sub_features

    self.ref = ref
    self.ref_db = ref_db
91
93 """A string representation of the record for debugging."""
94 answer = "%s(%s" % (self.__class__, repr(self.location))
95 if self.type :
96 answer += ", type=%s" % repr(self.type)
97 if self.location_operator :
98 answer += ", location_operator=%s" % repr(self.location_operator)
99 if self.strand :
100 answer += ", strand=%s" % repr(self.strand)
101 if self.id and self.id != "<unknown id>" :
102 answer += ", id=%s" % repr(self.id)
103 if self.ref :
104 answer += ", ref=%s" % repr(self.ref)
105 if self.ref_db :
106 answer += ", ref_db=%s" % repr(self.ref_db)
107 answer += ")"
108 return answer
109
111 """A readable summary of the feature intended to be printed to screen.
112 """
113 out = "type: %s\n" % self.type
114 out += "location: %s\n" % self.location
115 out += "ref: %s:%s\n" % (self.ref, self.ref_db)
116 out += "strand: %s\n" % self.strand
117 out += "qualifiers: \n"
118 qualifier_keys = self.qualifiers.keys()
119 qualifier_keys.sort()
120 for qual_key in qualifier_keys:
121 out += "\tKey: %s, Value: %s\n" % (qual_key,
122 self.qualifiers[qual_key])
123 if len(self.sub_features) != 0:
124 out += "Sub-Features\n"
125 for sub_feature in self.sub_features:
126 out +="%s\n" % sub_feature
127
128 return out
129
131 """Returns a copy of the feature with its location shifted (PRIVATE).
132
133 The annotation qualifiers are copied."""
134 answer = SeqFeature(location = self.location._shift(offset),
135 type = self.type,
136 location_operator = self.location_operator,
137 strand = self.strand,
138 id = self.id,
139
140
141 ref = self.ref,
142 ref_db = self.ref_db)
143
144 answer.sub_features = [f._shift(offset) for f in self.sub_features]
145 answer.qualifiers = dict(self.qualifiers.iteritems())
146 return answer
147
148
149
150
152 """Represent a Generic Reference object.
153
154 Attributes:
155 o location - A list of Location objects specifying regions of
156 the sequence that the references correspond to. If no locations are
157 specified, the entire sequence is assumed.
158 o authors - A big old string, or a list split by author, of authors
159 for the reference.
160 o title - The title of the reference.
161 o journal - Journal the reference was published in.
162 o medline_id - A medline reference for the article.
163 o pubmed_id - A pubmed reference for the article.
164 o comment - A place to stick any comments about the reference.
165 """
175
177 """Output an informative string for debugging.
178 """
179 out = ""
180 for single_location in self.location:
181 out += "location: %s\n" % single_location
182 out += "authors: %s\n" % self.authors
183 if self.consrtm:
184 out += "consrtm: %s\n" % self.consrtm
185 out += "title: %s\n" % self.title
186 out += "journal: %s\n" % self.journal
187 out += "medline id: %s\n" % self.medline_id
188 out += "pubmed id: %s\n" % self.pubmed_id
189 out += "comment: %s\n" % self.comment
190
191 return out
192
193
194
196 """Specify the location of a feature along a sequence.
197
198 This attempts to deal with fuzziness of position ends, but also
199 make it easy to get the start and end in the 'normal' case (no
200 fuzziness).
201
202 You should access the start and end attributes with
203 your_location.start and your_location.end. If the start and
204 end are exact, this will return the positions, if not, we'll return
205 the appropriate Fuzzy class with info about the position and fuzziness.
206
207 Note that the start and end location numbering follow Python's scheme,
208 thus a GenBank entry of 123..150 (one based counting) becomes a location
209 of [122:150] (zero based counting).
210 """
212 """Specify the start and end of a sequence feature.
213
214 start and end arguments specify the values where the feature begins
215 and ends. These can either be any of the *Position objects that
216 inherit from AbstractPosition, or can just be integers specifying the
217 position. In the case of integers, the values are assumed to be
218 exact and are converted into ExactPosition objects. This is meant
219 to make it easy to deal with non-fuzzy ends.
220 """
221 if isinstance(start, AbstractPosition):
222 self._start = start
223 else:
224 self._start = ExactPosition(start)
225
226 if isinstance(end, AbstractPosition):
227 self._end = end
228 else:
229 self._end = ExactPosition(end)
230
232 """Returns a representation of the location (with python counting).
233
234 For the simple case this uses the Python slicing syntax, [122:150]
235 (zero based counting) which GenBank would call 123..150 (one based
236 counting).
237 """
238 return "[%s:%s]" % (self._start, self._end)
239
241 """A string representation of the location for debugging."""
242 return "%s(%s,%s)" \
243 % (self.__class__, repr(self.start), repr(self.end))
244
249
251 """Make it easy to get non-fuzzy starts and ends.
252
253 We override get_attribute here so that in non-fuzzy cases we
254 can just return the start and end position without any hassle.
255
256 To get fuzzy start and ends, just ask for item.start and
257 item.end. To get non-fuzzy attributes (ie. the position only)
258 ask for 'item.nofuzzy_start', 'item.nofuzzy_end'. These should return
259 the largest range of the fuzzy position. So something like:
260 (10.20)..(30.40) should return 10 for start, and 40 for end.
261
262 The special tricky case is when we have a single between position
263 argument like 2^3 for the range. We want nofuzzy_start and nofuzzy_end
264 to give a reasonable approximation of what this really means, which
265 is an empty string -- so the same position for both. Doing a special
266 case here sucks, but there is really not a general rule you can apply
267 to this.
268 """
269
270
271 if attr == 'start':
272 return self._start
273 elif attr == 'end':
274 return self._end
275 elif attr == 'nofuzzy_start':
276 if ((self._start == self._end) and isinstance(self._start,
277 BetweenPosition)):
278 return self._start.position
279 else:
280 return min(self._start.position,
281 self._start.position + self._start.extension)
282 elif attr == 'nofuzzy_end':
283 if ((self._start == self._end) and isinstance(self._start,
284 BetweenPosition)):
285 return self._end.position
286 else:
287 return max(self._end.position,
288 self._end.position + self._end.extension)
289 else:
290 raise AttributeError("Cannot evaluate attribute %s." % attr)
291
293 """Abstract base class representing a position.
294 """
def __init__(self, position, extension):
    """Record the position and its extension on the instance."""
    self.position, self.extension = position, extension
298
300 """String representation of the location for debugging."""
301 return "%s(%s,%s)" \
302 % (self.__class__, repr(self.position), repr(self.extension))
303
305 """A simple comparison function for positions.
306
307 This is very simple-minded and just compares the position attribute
308 of the features; extensions are not considered at all. This could
309 potentially be expanded to try to take advantage of extensions.
310 """
311 assert isinstance(other, AbstractPosition), \
312 "We can only do comparisons between Biopython Position objects."
313
314 return cmp(self.position, other.position)
315
317
318 return self.__class__(self.position + offset, self.extension)
319
321 """Specify the specific position of a boundary.
322
323 o position - The position of the boundary.
324 o extension - An optional argument which must be zero since we don't
325 have an extension. The argument is provided so that the same number of
326 arguments can be passed to all position types.
327
328 In this case, there is no fuzziness associated with the position.
329 """
def __init__(self, position, extension=0):
    """Set up an exact position.

    o position - The position of the boundary.
    o extension - Accepted only so that all position types share one
    call signature; anything other than zero raises AttributeError.
    """
    if extension != 0:
        message = "Non-zero extension %s for exact position." % extension
        raise AttributeError(message)
    # The stored extension is always the literal 0.
    AbstractPosition.__init__(self, position, 0)
335
337 """String representation of the ExactPosition location for debugging."""
338 assert self.extension == 0
339 return "%s(%s)" % (self.__class__, repr(self.position))
340
342 return str(self.position)
343
345 """Specify the position of a boundary within some coordinates.
346
347 Arguments:
348 o position - The start position of the boundary
349 o extension - The range to which the boundary can extend.
350
351 This allows dealing with a position like ((1.4)..100). This
352 indicates that the start of the sequence is somewhere between 1
353 and 4. To represent that with this class we would set position as
354 1 and extension as 3.
355 """
356 - def __init__(self, position, extension = 0):
358
360 return "(%s.%s)" % (self.position, self.position + self.extension)
361
363 """Specify the position of a boundary between two coordinates.
364
365 Arguments:
366 o position - The start position of the boundary.
367 o extension - The range to the other position of a boundary.
368
369 This specifies a coordinate which is found between the two positions.
370 So this allows us to deal with a position like ((1^2)..100). To
371 represent that with this class we set position as 1 and the
372 extension as 1.
373 """
374 - def __init__(self, position, extension = 0):
376
378 return "(%s^%s)" % (self.position, self.position + self.extension)
379
381 """Specify a position where the actual location occurs before it.
382
383 Arguments:
384 o position - The upper boundary of where the location can occur.
385 o extension - An optional argument which must be zero since we don't
386 have an extension. The argument is provided so that the same number of
387 arguments can be passed to all position types.
388
389 This is used to specify positions like (<10..100) where the location
390 occurs somewhere before position 10.
391 """
def __init__(self, position, extension=0):
    """Set up a position whose real location lies before `position`.

    o position - The upper boundary of where the location can occur.
    o extension - Accepted only so that all position types share one
    call signature; anything other than zero raises AttributeError.
    """
    if extension != 0:
        # The message names this position type; it used to be a
        # copy of the exact-position wording.
        raise AttributeError("Non-zero extension %s for before position."
                             % extension)
    AbstractPosition.__init__(self, position, 0)
397
399 """A string representation of the location for debugging."""
400 assert self.extension == 0
401 return "%s(%s)" % (self.__class__, repr(self.position))
402
404 return "<%s" % self.position
405
407 """Specify a position where the actual location is found after it.
408
409 Arguments:
410 o position - The lower boundary of where the location can occur.
411 o extension - An optional argument which must be zero since we don't
412 have an extension. The argument is provided so that the same number of
413 arguments can be passed to all position types.
414
415 This is used to specify positions like (>10..100) where the location
416 occurs somewhere after position 10.
417 """
def __init__(self, position, extension=0):
    """Set up a position whose real location lies after `position`.

    o position - The lower boundary of where the location can occur.
    o extension - Accepted only so that all position types share one
    call signature; anything other than zero raises AttributeError.
    """
    if extension != 0:
        # The message names this position type; it used to be a
        # copy of the exact-position wording.
        raise AttributeError("Non-zero extension %s for after position."
                             % extension)
    AbstractPosition.__init__(self, position, 0)
423
425 """A string representation of the location for debugging."""
426 assert self.extension == 0
427 return "%s(%s)" % (self.__class__, repr(self.position))
428
430 return ">%s" % self.position
431
433 """Specify a position where the location can be multiple positions.
434
435 This models the GenBank 'one-of(1888,1901)' function, and tries
436 to make this fit within the Biopython Position models. In our case
437 the position of the "one-of" is set as the lowest choice, and the
438 extension is the range to the highest choice.
439 """
441 """Initialize with a set of possible positions.
442
443 position_list is a list of AbstractPosition derived objects,
444 specifying possible locations.
445 """
446
447 self.position_choices = position_list
448
449 smallest = None
450 largest = None
451 for position_choice in self.position_choices:
452 assert isinstance(position_choice, AbstractPosition), \
453 "Expected position objects, got %r" % position_choice
454 if smallest is None and largest is None:
455 smallest = position_choice.position
456 largest = position_choice.position
457 elif position_choice.position > largest:
458 largest = position_choice.position
459 elif position_choice.position < smallest:
460 smallest = position_choice.position
461
462 AbstractPosition.__init__(self, smallest, largest - smallest)
463
465 """String representation of the OneOfPosition location for debugging."""
466 return "%s(%s)" % (self.__class__, repr(self.position_choices))
467
469 out = "one-of("
470 for position in self.position_choices:
471 out += "%s," % position
472
473 out = out[:-1] + ")"
474 return out
475
477 """Simple class to hold information about a gap between positions.
478 """
480 """Initialize with a position object containing the gap information.
481 """
482 self.gap_size = gap_size
483
485 """A string representation of the position gap for debugging."""
486 return "%s(%s)" % (self.__class__, repr(self.gap_size))
487
489 out = "gap(%s)" % self.gap_size
490 return out
491