1
2
3
4
5
6
7
8 """Represent a Sequence Record, a sequence with annotation."""
9 __docformat__ = "epytext en"
10
11
12
13
14
15
17 """Dict which only allows sequences of given length as values (PRIVATE).
18
19 This simple subclass of the python dictionary is used in the SeqRecord
20 object for holding per-letter-annotations. This class is intended to
21 prevent simple errors by only allowing python sequences (e.g. lists,
22 strings and tuples) to be stored, and only if their length matches that
23 expected (the length of the SeqRecord's seq object). It cannot however
24 prevent the entries being edited in situ (for example appending entries
25 to a list).
26 """
32 if not hasattr(value,"__len__") or not hasattr(value,"__getitem__") \
33 or len(value) != self._length :
34 raise TypeError("We only allow python sequences (lists, tuples or "
35 "strings) of length %i." % self._length)
36 dict.__setitem__(self, key, value)
37
39 """A SeqRecord object holds a sequence and information about it.
40
41 Main attributes:
42 - id - Identifier such as a locus tag (string)
43 - seq - The sequence itself (Seq object)
44
45 Additional attributes:
46 - name - Sequence name, e.g. gene name (string)
47 - description - Additional text (string)
48 - dbxrefs - List of database cross references (list of strings)
49 - features - Any (sub)features defined (list of SeqFeature objects)
50 - annotations - Further information about the whole sequence (dictionary)
51 Most entries are lists of strings.
52 - letter_annotations - Per letter/symbol annotation (restricted
53 dictionary). This holds python sequences (lists, strings
54 or tuples) whose length matches that of the sequence.
55 A typical use would be to hold a list of integers
56 representing sequencing quality scores, or a string
57 representing the secondary structure.
58
59 You will typically use Bio.SeqIO to read in sequences from files as
60 SeqRecord objects. However, you may want to create your own SeqRecord
61 objects directly (see the __init__ method for further details):
62
63 >>> from Bio.Seq import Seq
64 >>> from Bio.SeqRecord import SeqRecord
65 >>> from Bio.Alphabet import IUPAC
66 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
67 ... IUPAC.protein),
68 ... id="YP_025292.1", name="HokC",
69 ... description="toxic membrane protein")
70 >>> print record
71 ID: YP_025292.1
72 Name: HokC
73 Description: toxic membrane protein
74 Number of features: 0
75 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
76
77 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO
78 for this. For the special case where you want the SeqRecord turned into
79 a string in a particular file format there is a format method which uses
80 Bio.SeqIO internally:
81
82 >>> print record.format("fasta")
83 >YP_025292.1 toxic membrane protein
84 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
85 <BLANKLINE>
86 """
87 - def __init__(self, seq, id = "<unknown id>", name = "<unknown name>",
88 description = "<unknown description>", dbxrefs = None,
89 features = None):
90 """Create a SeqRecord.
91
92 Arguments:
93 - seq - Sequence, required (Seq or Mutable object)
94 - id - Sequence identifier, recommended (string)
95 - name - Sequence name, optional (string)
96 - description - Sequence description, optional (string)
97 - dbxrefs - Database cross references, optional (list of strings)
98 - features - Any (sub)features, optional (list of SeqFeature objects)
99
100 You will typically use Bio.SeqIO to read in sequences from files as
101 SeqRecord objects. However, you may want to create your own SeqRecord
102 objects directly.
103
104 Note that while an id is optional, we strongly recommend you supply a
105 unique id string for each record. This is especially important
106 if you wish to write your sequences to a file.
107
108 If you don't have the actual sequence, but you do know its length,
109 then using the UnknownSeq object from Bio.Seq is appropriate.
110
111 You can create a 'blank' SeqRecord object, and then populate the
112 attributes later. Note that currently the annotations and the
113 letter_annotations dictionaries cannot be specified when creating
114 the SeqRecord.
115 """
116 if id is not None and not isinstance(id, basestring) :
117
118 raise TypeError("id argument should be a string")
119 if not isinstance(name, basestring) :
120 raise TypeError("name argument should be a string")
121 if not isinstance(description, basestring) :
122 raise TypeError("description argument should be a string")
123 if dbxrefs is not None and not isinstance(dbxrefs, list) :
124 raise TypeError("dbxrefs argument should be a list (of strings)")
125 if features is not None and not isinstance(features, list) :
126 raise TypeError("features argument should be a list (of SeqFeature objects)")
127 self._seq = seq
128 self.id = id
129 self.name = name
130 self.description = description
131 if dbxrefs is None:
132 dbxrefs = []
133 self.dbxrefs = dbxrefs
134
135 self.annotations = {}
136
137
138 if seq is None :
139
140 self._per_letter_annotations = _RestrictedDict(length=0)
141 else :
142 try :
143 self._per_letter_annotations = _RestrictedDict(length=len(seq))
144 except :
145 raise TypeError("seq argument should be a Seq or MutableSeq")
146
147
148 if features is None:
149 features = []
150 self.features = features
151
152
154 if not isinstance(value, dict) :
155 raise TypeError("The per-letter-annotations should be a "
156 "(restricted) dictionary.")
157
158 try :
159 self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
160 except AttributeError :
161
162 self._per_letter_annotations = _RestrictedDict(length=0)
163 self._per_letter_annotations.update(value)
164 letter_annotations = property( \
165 fget=lambda self : self._per_letter_annotations,
166 fset=_set_per_letter_annotations,
167 doc="""Dictionary of per-letter-annotation for the sequence.
168
169 For example, this can hold quality scores used in FASTQ or QUAL files.
170 Consider this example using Bio.SeqIO to read in an example Solexa
171 variant FASTQ file as a SeqRecord:
172
173 >>> from Bio import SeqIO
174 >>> handle = open("Quality/solexa.fastq", "rU")
175 >>> record = SeqIO.read(handle, "fastq-solexa")
176 >>> handle.close()
177 >>> print record.id, record.seq
178 slxa_0013_1_0001_24 ACAAAAATCACAAGCATTCTTATACACC
179 >>> print record.letter_annotations.keys()
180 ['solexa_quality']
181 >>> print record.letter_annotations["solexa_quality"]
182 [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -6, -1, -1, -4, -1, -4, -19, -10, -27, -18]
183
184 The per-letter-annotaions get sliced automatically if you slice the
185 parent SeqRecord, for example taking the last ten bases:
186
187 >>> sub_record = record[-10:]
188 >>> print sub_record.id, sub_record.seq
189 slxa_0013_1_0001_24 CTTATACACC
190 >>> print sub_record.letter_annotations["solexa_quality"]
191 [-6, -1, -1, -4, -1, -4, -19, -10, -27, -18]
192
193 Any python sequence (i.e. list, tuple or string) can be recorded in
194 the SeqRecord's letter_annotations dictionary as long as the length
195 matches that of the SeqRecord's sequence. e.g.
196
197 >>> len(sub_record.letter_annotations)
198 1
199 >>> sub_record.letter_annotations["dummy"] = "abcdefghij"
200 >>> len(sub_record.letter_annotations)
201 2
202
203 You can delete entries from the letter_annotations dictionary as usual:
204
205 >>> del sub_record.letter_annotations["solexa_quality"]
206 >>> sub_record.letter_annotations
207 {'dummy': 'abcdefghij'}
208
209 You can completely clear the dictionary easily as follows:
210
211 >>> sub_record.letter_annotations = {}
212 >>> sub_record.letter_annotations
213 {}
214 """)
215
217
218 if self._per_letter_annotations :
219
220 raise ValueError("You must empty the letter annotations first!")
221 self._seq = value
222 try :
223 self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
224 except AttributeError :
225
226 self._per_letter_annotations = _RestrictedDict(length=0)
227
228 seq = property(fget=lambda self : self._seq,
229 fset=_set_seq,
230 doc="The sequence itself, as a Seq or MutableSeq object.")
231
233 """Returns a sub-sequence or an individual letter.
234
235 Splicing, e.g. my_record[5:10], returns a new SeqRecord for
236 that sub-sequence with approriate annotation preserved. The
237 name, id and description are kept.
238
239 Any per-letter-annotations are sliced to match the requested
240 sub-sequence. Unless a stride is used, all those features
241 which fall fully within the subsequence are included (with
242 their locations adjusted accordingly).
243
244 However, the annotations dictionary and the dbxrefs list are
245 not used for the new SeqRecord, as in general they may not
246 apply to the subsequence. If you want to preserve them, you
247 must explictly copy them to the new SeqRecord yourself.
248
249 Using an integer index, e.g. my_record[5] is shorthand for
250 extracting that letter from the sequence, my_record.seq[5].
251
252 For example, consider this short protein and its secondary
253 structure as encoded by the PDB (e.g. H for alpha helices),
254 plus a simple feature for its histidine self phosphorylation
255 site:
256
257 >>> from Bio.Seq import Seq
258 >>> from Bio.SeqRecord import SeqRecord
259 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
260 >>> from Bio.Alphabet import IUPAC
261 >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT"
262 ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR",
263 ... IUPAC.protein),
264 ... id="1JOY", name="EnvZ",
265 ... description="Homodimeric domain of EnvZ from E. coli")
266 >>> rec.letter_annotations["secondary_structure"] = \
267 " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT "
268 >>> rec.features.append(SeqFeature(FeatureLocation(20,21),
269 ... type = "Site"))
270
271 Now let's have a quick look at the full record,
272
273 >>> print rec
274 ID: 1JOY
275 Name: EnvZ
276 Description: Homodimeric domain of EnvZ from E. coli
277 Number of features: 1
278 Per letter annotation for: secondary_structure
279 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
280 >>> print rec.letter_annotations["secondary_structure"]
281 S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT
282 >>> print rec.features[0].location
283 [20:21]
284
285 Now let's take a sub sequence, here chosen as the first (fractured)
286 alpha helix which includes the histidine phosphorylation site:
287
288 >>> sub = rec[11:41]
289 >>> print sub
290 ID: 1JOY
291 Name: EnvZ
292 Description: Homodimeric domain of EnvZ from E. coli
293 Number of features: 1
294 Per letter annotation for: secondary_structure
295 Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein())
296 >>> print sub.letter_annotations["secondary_structure"]
297 HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH
298 >>> print sub.features[0].location
299 [9:10]
300
301 You can also of course omit the start or end values, for
302 example to get the first ten letters only:
303
304 >>> print rec[:10]
305 ID: 1JOY
306 Name: EnvZ
307 Description: Homodimeric domain of EnvZ from E. coli
308 Number of features: 0
309 Per letter annotation for: secondary_structure
310 Seq('MAAGVKQLAD', IUPACProtein())
311
312 Or for the last ten letters:
313
314 >>> print rec[-10:]
315 ID: 1JOY
316 Name: EnvZ
317 Description: Homodimeric domain of EnvZ from E. coli
318 Number of features: 0
319 Per letter annotation for: secondary_structure
320 Seq('IIEQFIDYLR', IUPACProtein())
321
322 If you omit both, then you get a copy of the original record (although
323 lacking the annotations and dbxrefs):
324
325 >>> print rec[:]
326 ID: 1JOY
327 Name: EnvZ
328 Description: Homodimeric domain of EnvZ from E. coli
329 Number of features: 1
330 Per letter annotation for: secondary_structure
331 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
332
333 Finally, indexing with a simple integer is shorthand for pulling out
334 that letter from the sequence directly:
335
336 >>> rec[5]
337 'K'
338 >>> rec.seq[5]
339 'K'
340 """
341 if isinstance(index, int) :
342
343
344
345 return self.seq[index]
346 elif isinstance(index, slice) :
347 if self.seq is None :
348 raise ValueError("If the sequence is None, we cannot slice it.")
349 parent_length = len(self)
350 answer = self.__class__(self.seq[index],
351 id=self.id,
352 name=self.name,
353 description=self.description)
354
355
356
357
358
359
360
361
362
363
364 if index.step is None or index.step == 1 :
365
366 if index.start is None :
367 start = 0
368 else :
369 start = index.start
370 if index.stop is None :
371 stop = -1
372 else :
373 stop = index.stop
374 if (start < 0 or stop < 0) and parent_length == 0 :
375 raise ValueError, \
376 "Cannot support negative indices without the sequence length"
377 if start < 0 :
378 start = parent_length - start
379 if stop < 0 :
380 stop = parent_length - stop + 1
381
382 for f in self.features :
383 if start <= f.location.start.position \
384 and f.location.end.position < stop :
385 answer.features.append(f._shift(-start))
386
387
388
389 for key, value in self.letter_annotations.iteritems() :
390 answer._per_letter_annotations[key] = value[index]
391
392 return answer
393 raise ValueError, "Invalid index"
394
396 """Iterate over the letters in the sequence.
397
398 For example, using Bio.SeqIO to read in a protein FASTA file:
399
400 >>> from Bio import SeqIO
401 >>> record = SeqIO.read(open("Amino/loveliesbleeding.pro"),"fasta")
402 >>> for amino in record :
403 ... print amino
404 ... if amino == "L" : break
405 X
406 A
407 G
408 L
409 >>> print record.seq[3]
410 L
411
412 This is just a shortcut for iterating over the sequence directly:
413
414 >>> for amino in record.seq :
415 ... print amino
416 ... if amino == "L" : break
417 X
418 A
419 G
420 L
421 >>> print record.seq[3]
422 L
423
424 Note that this does not facilitate iteration together with any
425 per-letter-annotation. However, you can achieve that using the
426 python zip function on the record (or its sequence) and the relevant
427 per-letter-annotation:
428
429 >>> from Bio import SeqIO
430 >>> rec = SeqIO.read(open("Quality/solexa.fastq", "rU"),
431 ... "fastq-solexa")
432 >>> print rec.id, rec.seq
433 slxa_0013_1_0001_24 ACAAAAATCACAAGCATTCTTATACACC
434 >>> print rec.letter_annotations.keys()
435 ['solexa_quality']
436 >>> for nuc, qual in zip(rec,rec.letter_annotations["solexa_quality"]) :
437 ... if qual < -10 :
438 ... print nuc, qual
439 C -19
440 C -27
441 C -18
442
443 You may agree that using zip(rec.seq, ...) is more explicit than using
444 zip(rec, ...) as shown above.
445 """
446 return iter(self.seq)
447
449 """A human readable summary of the record and its annotation (string).
450
451 The python built in function str works by calling the object's ___str__
452 method. e.g.
453
454 >>> from Bio.Seq import Seq
455 >>> from Bio.SeqRecord import SeqRecord
456 >>> from Bio.Alphabet import IUPAC
457 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
458 ... IUPAC.protein),
459 ... id="YP_025292.1", name="HokC",
460 ... description="toxic membrane protein, small")
461 >>> print str(record)
462 ID: YP_025292.1
463 Name: HokC
464 Description: toxic membrane protein, small
465 Number of features: 0
466 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
467
468 In this example you don't actually need to call str explicity, as the
469 print command does this automatically:
470
471 >>> print record
472 ID: YP_025292.1
473 Name: HokC
474 Description: toxic membrane protein, small
475 Number of features: 0
476 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
477
478 Note that long sequences are shown truncated.
479 """
480 lines = []
481 if self.id : lines.append("ID: %s" % self.id)
482 if self.name : lines.append("Name: %s" % self.name)
483 if self.description : lines.append("Description: %s" % self.description)
484 if self.dbxrefs : lines.append("Database cross-references: " \
485 + ", ".join(self.dbxrefs))
486 lines.append("Number of features: %i" % len(self.features))
487 for a in self.annotations:
488 lines.append("/%s=%s" % (a, str(self.annotations[a])))
489 if self.letter_annotations :
490 lines.append("Per letter annotation for: " \
491 + ", ".join(self.letter_annotations.keys()))
492
493
494 lines.append(repr(self.seq))
495 return "\n".join(lines)
496
498 """A concise summary of the record for debugging (string).
499
500 The python built in function repr works by calling the object's ___repr__
501 method. e.g.
502
503 >>> from Bio.Seq import Seq
504 >>> from Bio.SeqRecord import SeqRecord
505 >>> from Bio.Alphabet import generic_protein
506 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT"
507 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ"
508 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ"
509 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF",
510 ... generic_protein),
511 ... id="NP_418483.1", name="b4059",
512 ... description="ssDNA-binding protein",
513 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"])
514 >>> print repr(rec)
515 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
516
517 At the python prompt you can also use this shorthand:
518
519 >>> rec
520 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
521
522 Note that long sequences are shown truncated.
523 """
524 return self.__class__.__name__ \
525 + "(seq=%s, id=%s, name=%s, description=%s, dbxrefs=%s)" \
526 % tuple(map(repr, (self.seq, self.id, self.name,
527 self.description, self.dbxrefs)))
528
562
580
582 """Returns the length of the sequence.
583
584 For example, using Bio.SeqIO to read in a FASTA nucleotide file:
585
586 >>> from Bio import SeqIO
587 >>> record = SeqIO.read(open("Nucleic/sweetpea.nu"),"fasta")
588 >>> len(record)
589 309
590 >>> len(record.seq)
591 309
592 """
593 return len(self.seq)
594
596 """Returns True regardless of the length of the sequence.
597
598 This behaviour is for backwards compatibility, since until the
599 __len__ method was added, a SeqRecord always evaluated as True.
600
601 Note that in comparison, a Seq object will evaluate to False if it
602 has a zero length sequence.
603
604 WARNING: The SeqRecord may in future evaluate to False when its
605 sequence is of zero length (in order to better match the Seq
606 object behaviour)!
607 """
608 return True
609
611 """Run the Bio.SeqRecord module's doctests (PRIVATE).
612
613 This will try and locate the unit tests directory, and run the doctests
614 from there in order that the relative paths used in the examples work.
615 """
616 import doctest
617 import os
618 if os.path.isdir(os.path.join("..","Tests")) :
619 print "Runing doctests..."
620 cur_dir = os.path.abspath(os.curdir)
621 os.chdir(os.path.join("..","Tests"))
622 doctest.testmod()
623 os.chdir(cur_dir)
624 del cur_dir
625 print "Done"
626
627 if __name__ == "__main__":
628 _test()
629