1
2
3
4
5
6
7
8
9
10 """Implementations of Biopython-like Seq objects on top of BioSQL.
11
12 This allows retrival of items stored in a BioSQL database using
13 a biopython-like Seq interface.
14 """
15
16 from Bio import Alphabet
17 from Bio.Seq import Seq, UnknownSeq
18 from Bio.SeqRecord import SeqRecord
19 from Bio import SeqFeature
20
22 - def __init__(self, primary_id, adaptor, alphabet, start, length):
23 """Create a new DBSeq object referring to a BioSQL entry.
24
25 You wouldn't normally create a DBSeq object yourself, this is done
26 for you when retreiving a DBSeqRecord object from the database.
27 """
28 self.primary_id = primary_id
29 self.adaptor = adaptor
30 self.alphabet = alphabet
31 self._length = length
32 self.start = start
33
36
38
39
40
41 if isinstance(index, int) :
42
43 i = index
44 if i < 0:
45 if -i > self._length:
46 raise IndexError(i)
47 i = i + self._length
48 elif i >= self._length:
49 raise IndexError(i)
50 return self.adaptor.get_subseq_as_string(self.primary_id,
51 self.start + i,
52 self.start + i + 1)
53 if not isinstance(index, slice) :
54 raise ValueError("Unexpected index type")
55
56
57
58 if index.start is None :
59 i=0
60 else :
61 i = index.start
62 if i < 0 :
63
64 if -i > self._length:
65 raise IndexError(i)
66 i = i + self._length
67 elif i >= self._length:
68
69 i = self._length
70
71 if index.stop is None :
72 j = self._length
73 else :
74 j = index.stop
75 if j < 0 :
76
77 if -j > self._length:
78 raise IndexError(j)
79 j = j + self._length
80 elif j >= self._length:
81 j = self._length
82
83 if i >= j:
84
85 return Seq("", self.alphabet)
86 elif index.step is None or index.step == 1 :
87
88 return self.__class__(self.primary_id, self.adaptor, self.alphabet,
89 self.start + i, j - i)
90 else :
91
92 full = self.adaptor.get_subseq_as_string(self.primary_id,
93 self.start + i,
94 self.start + j)
95 return Seq(full[::index.step], self.alphabet)
96
98 """Returns the full sequence as a python string.
99
100 Although not formally deprecated, you are now encouraged to use
101 str(my_seq) instead of my_seq.tostring()."""
102 return self.adaptor.get_subseq_as_string(self.primary_id,
103 self.start,
104 self.start + self._length)
110
111 data = property(tostring, doc="Sequence as string (DEPRECATED)")
112
114 """Returns the full sequence as a Seq object."""
115
116 return Seq(str(self), self.alphabet)
117
119
120 return self.toseq() + other
121
123
124 return other + self.toseq()
125
126
128
129
130
131
132
133
134 seqs = adaptor.execute_and_fetchall(
135 "SELECT alphabet, length, length(seq) FROM biosequence" \
136 " WHERE bioentry_id = %s", (primary_id,))
137 if not seqs : return
138 assert len(seqs) == 1
139 moltype, given_length, length = seqs[0]
140
141 try :
142 length = int(length)
143 given_length = int(length)
144 assert length == given_length
145 have_seq = True
146 except TypeError :
147 assert length is None
148 seqs = adaptor.execute_and_fetchall(
149 "SELECT alphabet, length, seq FROM biosequence" \
150 " WHERE bioentry_id = %s", (primary_id,))
151 assert len(seqs) == 1
152 moltype, given_length, seq = seqs[0]
153 assert seq is None or seq==""
154 length = int(given_length)
155 have_seq = False
156 del seq
157 del given_length
158
159 moltype = moltype.lower()
160
161
162 if moltype == "dna":
163 alphabet = Alphabet.generic_dna
164 elif moltype == "rna":
165 alphabet = Alphabet.generic_rna
166 elif moltype == "protein":
167 alphabet = Alphabet.generic_protein
168 elif moltype == "unknown":
169
170
171 alphabet = Alphabet.single_letter_alphabet
172 else:
173 raise AssertionError("Unknown moltype: %s" % moltype)
174
175 if have_seq :
176 return DBSeq(primary_id, adaptor, alphabet, 0, int(length))
177 else :
178 return UnknownSeq(length, alphabet)
179
181 """Retrieve the database cross references for the sequence."""
182 _dbxrefs = []
183 dbxrefs = adaptor.execute_and_fetchall(
184 "SELECT dbname, accession, version" \
185 " FROM bioentry_dbxref join dbxref using (dbxref_id)" \
186 " WHERE bioentry_id = %s" \
187 " ORDER BY rank", (primary_id,))
188 for dbname, accession, version in dbxrefs:
189 if version and version != "0":
190 v = "%s.%s" % (accession, version)
191 else:
192 v = accession
193 _dbxrefs.append("%s:%s" % (dbname, v))
194 return _dbxrefs
195
197 sql = "SELECT seqfeature_id, type.name, rank" \
198 " FROM seqfeature join term type on (type_term_id = type.term_id)" \
199 " WHERE bioentry_id = %s" \
200 " ORDER BY rank"
201 results = adaptor.execute_and_fetchall(sql, (primary_id,))
202 seq_feature_list = []
203 for seqfeature_id, seqfeature_type, seqfeature_rank in results:
204
205 qvs = adaptor.execute_and_fetchall(
206 "SELECT name, value" \
207 " FROM seqfeature_qualifier_value join term using (term_id)" \
208 " WHERE seqfeature_id = %s" \
209 " ORDER BY rank", (seqfeature_id,))
210 qualifiers = {}
211 for qv_name, qv_value in qvs:
212 qualifiers.setdefault(qv_name, []).append(qv_value)
213
214 qvs = adaptor.execute_and_fetchall(
215 "SELECT dbxref.dbname, dbxref.accession" \
216 " FROM dbxref join seqfeature_dbxref using (dbxref_id)" \
217 " WHERE seqfeature_dbxref.seqfeature_id = %s" \
218 " ORDER BY rank", (seqfeature_id,))
219 for qv_name, qv_value in qvs:
220 value = "%s:%s" % (qv_name, qv_value)
221 qualifiers.setdefault("db_xref", []).append(value)
222
223 results = adaptor.execute_and_fetchall(
224 "SELECT location_id, start_pos, end_pos, strand" \
225 " FROM location" \
226 " WHERE seqfeature_id = %s" \
227 " ORDER BY rank", (seqfeature_id,))
228 locations = []
229
230
231
232
233
234
235 for location_id, start, end, strand in results:
236 if start:
237 start -= 1
238 if strand == 0:
239 strand = None
240 locations.append( (location_id, start, end, strand) )
241
242 remote_results = adaptor.execute_and_fetchall(
243 "SELECT location_id, dbname, accession, version" \
244 " FROM location join dbxref using (dbxref_id)" \
245 " WHERE seqfeature_id = %s", (seqfeature_id,))
246 lookup = {}
247 for location_id, dbname, accession, version in remote_results:
248 if version and version != "0":
249 v = "%s.%s" % (accession, version)
250 else:
251 v = accession
252
253
254 if dbname == "":
255 dbname = None
256 lookup[location_id] = (dbname, v)
257
258 feature = SeqFeature.SeqFeature(type = seqfeature_type)
259 feature._seqfeature_id = seqfeature_id
260 feature.qualifiers = qualifiers
261 if len(locations) == 0:
262 pass
263 elif len(locations) == 1:
264 location_id, start, end, strand = locations[0]
265
266
267 feature.location_operator = \
268 _retrieve_location_qualifier_value(adaptor, location_id)
269 dbname, version = lookup.get(location_id, (None, None))
270 feature.location = SeqFeature.FeatureLocation(start, end)
271 feature.strand = strand
272 feature.ref_db = dbname
273 feature.ref = version
274 else:
275 assert feature.sub_features == []
276 for location in locations:
277 location_id, start, end, strand = location
278 dbname, version = lookup.get(location_id, (None, None))
279 subfeature = SeqFeature.SeqFeature()
280 subfeature.type = seqfeature_type
281 subfeature.location_operator = \
282 _retrieve_location_qualifier_value(adaptor, location_id)
283
284
285
286 if not subfeature.location_operator :
287 subfeature.location_operator="join"
288 subfeature.location = SeqFeature.FeatureLocation(start, end)
289 subfeature.strand = strand
290 subfeature.ref_db = dbname
291 subfeature.ref = version
292 feature.sub_features.append(subfeature)
293
294
295 feature.location_operator = \
296 feature.sub_features[0].location_operator
297
298
299 start = locations[0][1]
300 end = locations[-1][2]
301 feature.location = SeqFeature.FeatureLocation(start, end)
302 feature.strand = feature.sub_features[0].strand
303
304 seq_feature_list.append(feature)
305
306 return seq_feature_list
307
309 value = adaptor.execute_and_fetch_col0(
310 "SELECT value FROM location_qualifier_value" \
311 " WHERE location_id = %s", (location_id,))
312 try:
313 return value[0]
314 except IndexError:
315 return ""
316
324
326 qvs = adaptor.execute_and_fetchall(
327 "SELECT name, value" \
328 " FROM bioentry_qualifier_value JOIN term USING (term_id)" \
329 " WHERE bioentry_id = %s" \
330 " ORDER BY rank", (primary_id,))
331 qualifiers = {}
332 for name, value in qvs:
333 if name == "keyword": name = "keywords"
334 elif name == "date_changed": name = "dates"
335 elif name == "secondary_accession": name = "accessions"
336 qualifiers.setdefault(name, []).append(value)
337 return qualifiers
338
340
341
342 refs = adaptor.execute_and_fetchall(
343 "SELECT start_pos, end_pos, " \
344 " location, title, authors," \
345 " dbname, accession" \
346 " FROM bioentry_reference" \
347 " JOIN reference USING (reference_id)" \
348 " LEFT JOIN dbxref USING (dbxref_id)" \
349 " WHERE bioentry_id = %s" \
350 " ORDER BY rank", (primary_id,))
351 references = []
352 for start, end, location, title, authors, dbname, accession in refs:
353 reference = SeqFeature.Reference()
354 if start: start -= 1
355 reference.location = [SeqFeature.FeatureLocation(start, end)]
356
357 if authors : reference.authors = authors
358 if title : reference.title = title
359 reference.journal = location
360 if dbname == 'PUBMED':
361 reference.pubmed_id = accession
362 elif dbname == 'MEDLINE':
363 reference.medline_id = accession
364 references.append(reference)
365 if references :
366 return {'references': references}
367 else :
368 return {}
369
371 a = {}
372 common_names = adaptor.execute_and_fetch_col0(
373 "SELECT name FROM taxon_name WHERE taxon_id = %s" \
374 " AND name_class = 'genbank common name'", (taxon_id,))
375 if common_names:
376 a['source'] = common_names[0]
377 scientific_names = adaptor.execute_and_fetch_col0(
378 "SELECT name FROM taxon_name WHERE taxon_id = %s" \
379 " AND name_class = 'scientific name'", (taxon_id,))
380 if scientific_names:
381 a['organism'] = scientific_names[0]
382 ncbi_taxids = adaptor.execute_and_fetch_col0(
383 "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,))
384 if ncbi_taxids and ncbi_taxids[0] and ncbi_taxids[0] != "0":
385 a['ncbi_taxid'] = ncbi_taxids[0]
386
387
388
389
390
391
392
393
394
395
396 taxonomy = []
397 while taxon_id :
398 name, rank, parent_taxon_id = adaptor.execute_one(
399 "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id" \
400 " FROM taxon, taxon_name" \
401 " WHERE taxon.taxon_id=taxon_name.taxon_id" \
402 " AND taxon_name.name_class='scientific name'" \
403 " AND taxon.taxon_id = %s", (taxon_id,))
404 if taxon_id == parent_taxon_id :
405
406
407
408 break
409 if rank != "no rank" :
410
411
412
413 taxonomy.insert(0, name)
414 taxon_id = parent_taxon_id
415
416 if taxonomy:
417 a['taxonomy'] = taxonomy
418 return a
419
431
433 """BioSQL equivalent of the biopython SeqRecord object.
434 """
435
436 - def __init__(self, adaptor, primary_id):
437 self._adaptor = adaptor
438 self._primary_id = primary_id
439
440 (self._biodatabase_id, self._taxon_id, self.name,
441 accession, version, self._identifier,
442 self._division, self.description) = self._adaptor.execute_one(
443 "SELECT biodatabase_id, taxon_id, name, accession, version," \
444 " identifier, division, description" \
445 " FROM bioentry" \
446 " WHERE bioentry_id = %s", (self._primary_id,))
447 if version and version != "0":
448 self.id = "%s.%s" % (accession, version)
449 else:
450 self.id = accession
451
453 if not hasattr(self, "_seq"):
454 self._seq = _retrieve_seq(self._adaptor, self._primary_id)
455 return self._seq
458 seq = property(__get_seq, __set_seq, __del_seq, "Seq object")
459
461 if not hasattr(self,"_dbxrefs"):
462 self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id)
463 return self._dbxrefs
466 dbxrefs = property(__get_dbxrefs, __set_dbxrefs, __del_dbxrefs,
467 "Database cross references")
468
470 if not hasattr(self, "_features"):
471 self._features = _retrieve_features(self._adaptor,
472 self._primary_id)
473 return self._features
476 features = property(__get_features, __set_features, __del_features,
477 "Features")
478
480 if not hasattr(self, "_annotations"):
481 self._annotations = _retrieve_annotations(self._adaptor,
482 self._primary_id,
483 self._taxon_id)
484 if self._identifier:
485 self._annotations["gi"] = self._identifier
486 if self._division:
487 self._annotations["data_file_division"] = self._division
488 return self._annotations
491 annotations = property(__get_annotations, __set_annotations,
492 __del_annotations, "Annotations")
493