1
2
3
4
5
6
7
8 """Represent a Sequence Record, a sequence with annotation."""
9 __docformat__ = "epytext en"
10
11
12
13
14
15
17 """Dict which only allows sequences of given length as values (PRIVATE).
18
19 This simple subclass of the Python dictionary is used in the SeqRecord
20 object for holding per-letter-annotations. This class is intended to
21 prevent simple errors by only allowing python sequences (e.g. lists,
22 strings and tuples) to be stored, and only if their length matches that
23 expected (the length of the SeqRecord's seq object). It cannot however
24 prevent the entries being edited in situ (for example appending entries
25 to a list).
26 """
32 if not hasattr(value,"__len__") or not hasattr(value,"__getitem__") \
33 or len(value) != self._length :
34 raise TypeError("We only allow python sequences (lists, tuples or "
35 "strings) of length %i." % self._length)
36 dict.__setitem__(self, key, value)
41
43 """A SeqRecord object holds a sequence and information about it.
44
45 Main attributes:
46 - id - Identifier such as a locus tag (string)
47 - seq - The sequence itself (Seq object or similar)
48
49 Additional attributes:
50 - name - Sequence name, e.g. gene name (string)
51 - description - Additional text (string)
52 - dbxrefs - List of database cross references (list of strings)
53 - features - Any (sub)features defined (list of SeqFeature objects)
54 - annotations - Further information about the whole sequence (dictionary)
55 Most entries are strings, or lists of strings.
56 - letter_annotations - Per letter/symbol annotation (restricted
57 dictionary). This holds Python sequences (lists, strings
58 or tuples) whose length matches that of the sequence.
59 A typical use would be to hold a list of integers
60 representing sequencing quality scores, or a string
61 representing the secondary structure.
62
63 You will typically use Bio.SeqIO to read in sequences from files as
64 SeqRecord objects. However, you may want to create your own SeqRecord
65 objects directly (see the __init__ method for further details):
66
67 >>> from Bio.Seq import Seq
68 >>> from Bio.SeqRecord import SeqRecord
69 >>> from Bio.Alphabet import IUPAC
70 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
71 ... IUPAC.protein),
72 ... id="YP_025292.1", name="HokC",
73 ... description="toxic membrane protein")
74 >>> print record
75 ID: YP_025292.1
76 Name: HokC
77 Description: toxic membrane protein
78 Number of features: 0
79 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
80
81 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO
82 for this. For the special case where you want the SeqRecord turned into
83 a string in a particular file format there is a format method which uses
84 Bio.SeqIO internally:
85
86 >>> print record.format("fasta")
87 >YP_025292.1 toxic membrane protein
88 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
89 <BLANKLINE>
90 """
91 - def __init__(self, seq, id = "<unknown id>", name = "<unknown name>",
92 description = "<unknown description>", dbxrefs = None,
93 features = None, annotations = None,
94 letter_annotations = None):
95 """Create a SeqRecord.
96
97 Arguments:
98 - seq - Sequence, required (Seq, MutableSeq or UnknownSeq)
99 - id - Sequence identifier, recommended (string)
100 - name - Sequence name, optional (string)
101 - description - Sequence description, optional (string)
102 - dbxrefs - Database cross references, optional (list of strings)
103 - features - Any (sub)features, optional (list of SeqFeature objects)
104 - annotations - Dictionary of annotations for the whole sequence
105 - letter_annotations - Dictionary of per-letter-annotations, values
106 should be strings, list or tuples of the same
107 length as the full sequence.
108
109 You will typically use Bio.SeqIO to read in sequences from files as
110 SeqRecord objects. However, you may want to create your own SeqRecord
111 objects directly.
112
113 Note that while an id is optional, we strongly recommend you supply a
114 unique id string for each record. This is especially important
115 if you wish to write your sequences to a file.
116
117 If you don't have the actual sequence, but you do know its length,
118 then using the UnknownSeq object from Bio.Seq is appropriate.
119
120 You can create a 'blank' SeqRecord object, and then populate the
121 attributes later.
122 """
123 if id is not None and not isinstance(id, basestring) :
124
125 raise TypeError("id argument should be a string")
126 if not isinstance(name, basestring) :
127 raise TypeError("name argument should be a string")
128 if not isinstance(description, basestring) :
129 raise TypeError("description argument should be a string")
130 self._seq = seq
131 self.id = id
132 self.name = name
133 self.description = description
134
135
136 if dbxrefs is None:
137 dbxrefs = []
138 elif not isinstance(dbxrefs, list) :
139 raise TypeError("dbxrefs argument should be a list (of strings)")
140 self.dbxrefs = dbxrefs
141
142
143 if annotations is None:
144 annotations = {}
145 elif not isinstance(annotations, dict) :
146 raise TypeError("annotations argument should be a dict")
147 self.annotations = annotations
148
149 if letter_annotations is None:
150
151 if seq is None :
152
153 self._per_letter_annotations = _RestrictedDict(length=0)
154 else :
155 try :
156 self._per_letter_annotations = \
157 _RestrictedDict(length=len(seq))
158 except :
159 raise TypeError("seq argument should be a Seq object or similar")
160 else :
161
162
163
164 self.letter_annotations = letter_annotations
165
166
167 if features is None:
168 features = []
169 elif not isinstance(features, list) :
170 raise TypeError("features argument should be a list (of SeqFeature objects)")
171 self.features = features
172
173
175 if not isinstance(value, dict) :
176 raise TypeError("The per-letter-annotations should be a "
177 "(restricted) dictionary.")
178
179 try :
180 self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
181 except AttributeError :
182
183 self._per_letter_annotations = _RestrictedDict(length=0)
184 self._per_letter_annotations.update(value)
185 letter_annotations = property( \
186 fget=lambda self : self._per_letter_annotations,
187 fset=_set_per_letter_annotations,
188 doc="""Dictionary of per-letter-annotation for the sequence.
189
190 For example, this can hold quality scores used in FASTQ or QUAL files.
191 Consider this example using Bio.SeqIO to read in an example Solexa
192 variant FASTQ file as a SeqRecord:
193
194 >>> from Bio import SeqIO
195 >>> handle = open("Quality/solexa_faked.fastq", "rU")
196 >>> record = SeqIO.read(handle, "fastq-solexa")
197 >>> handle.close()
198 >>> print record.id, record.seq
199 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
200 >>> print record.letter_annotations.keys()
201 ['solexa_quality']
202 >>> print record.letter_annotations["solexa_quality"]
203 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
204
205 The letter_annotations get sliced automatically if you slice the
206 parent SeqRecord, for example taking the last ten bases:
207
208 >>> sub_record = record[-10:]
209 >>> print sub_record.id, sub_record.seq
210 slxa_0001_1_0001_01 ACGTNNNNNN
211 >>> print sub_record.letter_annotations["solexa_quality"]
212 [4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
213
214 Any python sequence (i.e. list, tuple or string) can be recorded in
215 the SeqRecord's letter_annotations dictionary as long as the length
216 matches that of the SeqRecord's sequence. e.g.
217
218 >>> len(sub_record.letter_annotations)
219 1
220 >>> sub_record.letter_annotations["dummy"] = "abcdefghij"
221 >>> len(sub_record.letter_annotations)
222 2
223
224 You can delete entries from the letter_annotations dictionary as usual:
225
226 >>> del sub_record.letter_annotations["solexa_quality"]
227 >>> sub_record.letter_annotations
228 {'dummy': 'abcdefghij'}
229
230 You can completely clear the dictionary easily as follows:
231
232 >>> sub_record.letter_annotations = {}
233 >>> sub_record.letter_annotations
234 {}
235 """)
236
238
239 if self._per_letter_annotations :
240
241 raise ValueError("You must empty the letter annotations first!")
242 self._seq = value
243 try :
244 self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
245 except AttributeError :
246
247 self._per_letter_annotations = _RestrictedDict(length=0)
248
249 seq = property(fget=lambda self : self._seq,
250 fset=_set_seq,
251 doc="The sequence itself, as a Seq or MutableSeq object.")
252
254 """Returns a sub-sequence or an individual letter.
255
256 Slicing, e.g. my_record[5:10], returns a new SeqRecord for
257 that sub-sequence with approriate annotation preserved. The
258 name, id and description are kept.
259
260 Any per-letter-annotations are sliced to match the requested
261 sub-sequence. Unless a stride is used, all those features
262 which fall fully within the subsequence are included (with
263 their locations adjusted accordingly).
264
265 However, the annotations dictionary and the dbxrefs list are
266 not used for the new SeqRecord, as in general they may not
267 apply to the subsequence. If you want to preserve them, you
268 must explictly copy them to the new SeqRecord yourself.
269
270 Using an integer index, e.g. my_record[5] is shorthand for
271 extracting that letter from the sequence, my_record.seq[5].
272
273 For example, consider this short protein and its secondary
274 structure as encoded by the PDB (e.g. H for alpha helices),
275 plus a simple feature for its histidine self phosphorylation
276 site:
277
278 >>> from Bio.Seq import Seq
279 >>> from Bio.SeqRecord import SeqRecord
280 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
281 >>> from Bio.Alphabet import IUPAC
282 >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT"
283 ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR",
284 ... IUPAC.protein),
285 ... id="1JOY", name="EnvZ",
286 ... description="Homodimeric domain of EnvZ from E. coli")
287 >>> rec.letter_annotations["secondary_structure"] = \
288 " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT "
289 >>> rec.features.append(SeqFeature(FeatureLocation(20,21),
290 ... type = "Site"))
291
292 Now let's have a quick look at the full record,
293
294 >>> print rec
295 ID: 1JOY
296 Name: EnvZ
297 Description: Homodimeric domain of EnvZ from E. coli
298 Number of features: 1
299 Per letter annotation for: secondary_structure
300 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
301 >>> print rec.letter_annotations["secondary_structure"]
302 S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT
303 >>> print rec.features[0].location
304 [20:21]
305
306 Now let's take a sub sequence, here chosen as the first (fractured)
307 alpha helix which includes the histidine phosphorylation site:
308
309 >>> sub = rec[11:41]
310 >>> print sub
311 ID: 1JOY
312 Name: EnvZ
313 Description: Homodimeric domain of EnvZ from E. coli
314 Number of features: 1
315 Per letter annotation for: secondary_structure
316 Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein())
317 >>> print sub.letter_annotations["secondary_structure"]
318 HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH
319 >>> print sub.features[0].location
320 [9:10]
321
322 You can also of course omit the start or end values, for
323 example to get the first ten letters only:
324
325 >>> print rec[:10]
326 ID: 1JOY
327 Name: EnvZ
328 Description: Homodimeric domain of EnvZ from E. coli
329 Number of features: 0
330 Per letter annotation for: secondary_structure
331 Seq('MAAGVKQLAD', IUPACProtein())
332
333 Or for the last ten letters:
334
335 >>> print rec[-10:]
336 ID: 1JOY
337 Name: EnvZ
338 Description: Homodimeric domain of EnvZ from E. coli
339 Number of features: 0
340 Per letter annotation for: secondary_structure
341 Seq('IIEQFIDYLR', IUPACProtein())
342
343 If you omit both, then you get a copy of the original record (although
344 lacking the annotations and dbxrefs):
345
346 >>> print rec[:]
347 ID: 1JOY
348 Name: EnvZ
349 Description: Homodimeric domain of EnvZ from E. coli
350 Number of features: 1
351 Per letter annotation for: secondary_structure
352 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
353
354 Finally, indexing with a simple integer is shorthand for pulling out
355 that letter from the sequence directly:
356
357 >>> rec[5]
358 'K'
359 >>> rec.seq[5]
360 'K'
361 """
362 if isinstance(index, int) :
363
364
365
366 return self.seq[index]
367 elif isinstance(index, slice) :
368 if self.seq is None :
369 raise ValueError("If the sequence is None, we cannot slice it.")
370 parent_length = len(self)
371 answer = self.__class__(self.seq[index],
372 id=self.id,
373 name=self.name,
374 description=self.description)
375
376
377
378
379
380
381
382
383
384
385 if index.step is None or index.step == 1 :
386
387 if index.start is None :
388 start = 0
389 else :
390 start = index.start
391 if index.stop is None :
392 stop = -1
393 else :
394 stop = index.stop
395 if (start < 0 or stop < 0) and parent_length == 0 :
396 raise ValueError, \
397 "Cannot support negative indices without the sequence length"
398 if start < 0 :
399 start = parent_length - start
400 if stop < 0 :
401 stop = parent_length - stop + 1
402
403 for f in self.features :
404 if f.ref or f.ref_db :
405
406 import warnings
407 warnings.warn("When slicing SeqRecord objects, any "
408 "SeqFeature referencing other sequences (e.g. "
409 "from segmented GenBank records) is ignored.")
410 continue
411 if start <= f.location.start.position \
412 and f.location.end.position < stop :
413 answer.features.append(f._shift(-start))
414
415
416
417 for key, value in self.letter_annotations.iteritems() :
418 answer._per_letter_annotations[key] = value[index]
419
420 return answer
421 raise ValueError, "Invalid index"
422
424 """Iterate over the letters in the sequence.
425
426 For example, using Bio.SeqIO to read in a protein FASTA file:
427
428 >>> from Bio import SeqIO
429 >>> record = SeqIO.read(open("Amino/loveliesbleeding.pro"),"fasta")
430 >>> for amino in record :
431 ... print amino
432 ... if amino == "L" : break
433 X
434 A
435 G
436 L
437 >>> print record.seq[3]
438 L
439
440 This is just a shortcut for iterating over the sequence directly:
441
442 >>> for amino in record.seq :
443 ... print amino
444 ... if amino == "L" : break
445 X
446 A
447 G
448 L
449 >>> print record.seq[3]
450 L
451
452 Note that this does not facilitate iteration together with any
453 per-letter-annotation. However, you can achieve that using the
454 python zip function on the record (or its sequence) and the relevant
455 per-letter-annotation:
456
457 >>> from Bio import SeqIO
458 >>> rec = SeqIO.read(open("Quality/solexa_faked.fastq", "rU"),
459 ... "fastq-solexa")
460 >>> print rec.id, rec.seq
461 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
462 >>> print rec.letter_annotations.keys()
463 ['solexa_quality']
464 >>> for nuc, qual in zip(rec,rec.letter_annotations["solexa_quality"]) :
465 ... if qual > 35 :
466 ... print nuc, qual
467 A 40
468 C 39
469 G 38
470 T 37
471 A 36
472
473 You may agree that using zip(rec.seq, ...) is more explicit than using
474 zip(rec, ...) as shown above.
475 """
476 return iter(self.seq)
477
479 """Implements the 'in' keyword, searches the sequence.
480
481 e.g.
482
483 >>> from Bio import SeqIO
484 >>> record = SeqIO.read(open("Nucleic/sweetpea.nu"), "fasta")
485 >>> "GAATTC" in record
486 False
487 >>> "AAA" in record
488 True
489
490 This essentially acts as a proxy for using "in" on the sequence:
491
492 >>> "GAATTC" in record.seq
493 False
494 >>> "AAA" in record.seq
495 True
496
497 Note that you can also use Seq objects as the query,
498
499 >>> from Bio.Seq import Seq
500 >>> from Bio.Alphabet import generic_dna
501 >>> Seq("AAA") in record
502 True
503 >>> Seq("AAA", generic_dna) in record
504 True
505
506 See also the Seq object's __contains__ method.
507 """
508 return char in self.seq
509
510
512 """A human readable summary of the record and its annotation (string).
513
514 The python built in function str works by calling the object's ___str__
515 method. e.g.
516
517 >>> from Bio.Seq import Seq
518 >>> from Bio.SeqRecord import SeqRecord
519 >>> from Bio.Alphabet import IUPAC
520 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
521 ... IUPAC.protein),
522 ... id="YP_025292.1", name="HokC",
523 ... description="toxic membrane protein, small")
524 >>> print str(record)
525 ID: YP_025292.1
526 Name: HokC
527 Description: toxic membrane protein, small
528 Number of features: 0
529 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
530
531 In this example you don't actually need to call str explicity, as the
532 print command does this automatically:
533
534 >>> print record
535 ID: YP_025292.1
536 Name: HokC
537 Description: toxic membrane protein, small
538 Number of features: 0
539 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
540
541 Note that long sequences are shown truncated.
542 """
543 lines = []
544 if self.id : lines.append("ID: %s" % self.id)
545 if self.name : lines.append("Name: %s" % self.name)
546 if self.description : lines.append("Description: %s" % self.description)
547 if self.dbxrefs : lines.append("Database cross-references: " \
548 + ", ".join(self.dbxrefs))
549 lines.append("Number of features: %i" % len(self.features))
550 for a in self.annotations:
551 lines.append("/%s=%s" % (a, str(self.annotations[a])))
552 if self.letter_annotations :
553 lines.append("Per letter annotation for: " \
554 + ", ".join(self.letter_annotations.keys()))
555
556
557 lines.append(repr(self.seq))
558 return "\n".join(lines)
559
561 """A concise summary of the record for debugging (string).
562
563 The python built in function repr works by calling the object's ___repr__
564 method. e.g.
565
566 >>> from Bio.Seq import Seq
567 >>> from Bio.SeqRecord import SeqRecord
568 >>> from Bio.Alphabet import generic_protein
569 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT"
570 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ"
571 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ"
572 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF",
573 ... generic_protein),
574 ... id="NP_418483.1", name="b4059",
575 ... description="ssDNA-binding protein",
576 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"])
577 >>> print repr(rec)
578 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
579
580 At the python prompt you can also use this shorthand:
581
582 >>> rec
583 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
584
585 Note that long sequences are shown truncated. Also note that any
586 annotations, letter_annotations and features are not shown (as they
587 would lead to a very long string).
588 """
589 return self.__class__.__name__ \
590 + "(seq=%s, id=%s, name=%s, description=%s, dbxrefs=%s)" \
591 % tuple(map(repr, (self.seq, self.id, self.name,
592 self.description, self.dbxrefs)))
593
627
645
647 """Returns the length of the sequence.
648
649 For example, using Bio.SeqIO to read in a FASTA nucleotide file:
650
651 >>> from Bio import SeqIO
652 >>> record = SeqIO.read(open("Nucleic/sweetpea.nu"),"fasta")
653 >>> len(record)
654 309
655 >>> len(record.seq)
656 309
657 """
658 return len(self.seq)
659
661 """Returns True regardless of the length of the sequence.
662
663 This behaviour is for backwards compatibility, since until the
664 __len__ method was added, a SeqRecord always evaluated as True.
665
666 Note that in comparison, a Seq object will evaluate to False if it
667 has a zero length sequence.
668
669 WARNING: The SeqRecord may in future evaluate to False when its
670 sequence is of zero length (in order to better match the Seq
671 object behaviour)!
672 """
673 return True
674
676 """Run the Bio.SeqRecord module's doctests (PRIVATE).
677
678 This will try and locate the unit tests directory, and run the doctests
679 from there in order that the relative paths used in the examples work.
680 """
681 import doctest
682 import os
683 if os.path.isdir(os.path.join("..","Tests")) :
684 print "Runing doctests..."
685 cur_dir = os.path.abspath(os.curdir)
686 os.chdir(os.path.join("..","Tests"))
687 doctest.testmod()
688 os.chdir(cur_dir)
689 del cur_dir
690 print "Done"
691
692 if __name__ == "__main__":
693 _test()
694