Package BioSQL :: Module BioSeq
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.BioSeq

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2008 copyright by Peter Cock.  All rights reserved. 
  3  # Revisions 2008 copyright by Cymon J. Cox.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  # 
  8  # Note that BioSQL (including the database schema and scripts) is 
  9  # available and licensed separately.  Please consult www.biosql.org 
 10  """Implementations of Biopython-like Seq objects on top of BioSQL. 
 11   
 12  This allows retrival of items stored in a BioSQL database using 
 13  a biopython-like Seq interface. 
 14  """ 
 15   
 16  from Bio import Alphabet 
 17  from Bio.Seq import Seq, UnknownSeq 
 18  from Bio.SeqRecord import SeqRecord 
 19  from Bio import SeqFeature 
 20   
21 -class DBSeq(Seq): # This implements the biopython Seq interface
22 - def __init__(self, primary_id, adaptor, alphabet, start, length):
23 """Create a new DBSeq object referring to a BioSQL entry. 24 25 You wouldn't normally create a DBSeq object yourself, this is done 26 for you when retreiving a DBSeqRecord object from the database. 27 """ 28 self.primary_id = primary_id 29 self.adaptor = adaptor 30 self.alphabet = alphabet 31 self._length = length 32 self.start = start
33
34 - def __len__(self):
35 return self._length
36
37 - def __getitem__(self, index) : # Seq API requirement
38 #Note since Python 2.0, __getslice__ is deprecated 39 #and __getitem__ is used instead. 40 #See http://docs.python.org/ref/sequence-methods.html 41 if isinstance(index, int) : 42 #Return a single letter as a string 43 i = index 44 if i < 0: 45 if -i > self._length: 46 raise IndexError(i) 47 i = i + self._length 48 elif i >= self._length: 49 raise IndexError(i) 50 return self.adaptor.get_subseq_as_string(self.primary_id, 51 self.start + i, 52 self.start + i + 1) 53 if not isinstance(index, slice) : 54 raise ValueError("Unexpected index type") 55 56 #Return the (sub)sequence as another DBSeq or Seq object 57 #(see the Seq obect's __getitem__ method) 58 if index.start is None : 59 i=0 60 else : 61 i = index.start 62 if i < 0 : 63 #Map to equavilent positive index 64 if -i > self._length: 65 raise IndexError(i) 66 i = i + self._length 67 elif i >= self._length: 68 #Trivial case, should return empty string! 69 i = self._length 70 71 if index.stop is None : 72 j = self._length 73 else : 74 j = index.stop 75 if j < 0 : 76 #Map to equavilent positive index 77 if -j > self._length: 78 raise IndexError(j) 79 j = j + self._length 80 elif j >= self._length: 81 j = self._length 82 83 if i >= j: 84 #Trivial case, empty string. 85 return Seq("", self.alphabet) 86 elif index.step is None or index.step == 1 : 87 #Easy case - can return a DBSeq with the start and end adjusted 88 return self.__class__(self.primary_id, self.adaptor, self.alphabet, 89 self.start + i, j - i) 90 else : 91 #Tricky. Will have to create a Seq object because of the stride 92 full = self.adaptor.get_subseq_as_string(self.primary_id, 93 self.start + i, 94 self.start + j) 95 return Seq(full[::index.step], self.alphabet) 96
97 - def tostring(self):
98 """Returns the full sequence as a python string. 99 100 Although not formally deprecated, you are now encouraged to use 101 str(my_seq) instead of my_seq.tostring().""" 102 return self.adaptor.get_subseq_as_string(self.primary_id, 103 self.start, 104 self.start + self._length)
105 - def __str__(self):
106 """Returns the full sequence as a python string.""" 107 return self.adaptor.get_subseq_as_string(self.primary_id, 108 self.start, 109 self.start + self._length)
110 111 data = property(tostring, doc="Sequence as string (DEPRECATED)") 112
113 - def toseq(self):
114 """Returns the full sequence as a Seq object.""" 115 #Note - the method name copies that of the MutableSeq object 116 return Seq(str(self), self.alphabet)
117
118 - def __add__(self, other) :
119 #Let the Seq object deal with the alphabet issues etc 120 return self.toseq() + other
121
122 - def __radd__(self, other) :
123 #Let the Seq object deal with the alphabet issues etc 124 return other + self.toseq()
125 126
127 -def _retrieve_seq(adaptor, primary_id):
128 #The database schema ensures there will be only one matching 129 #row in the table. 130 131 #If an UnknownSeq was recorded, seq will be NULL, 132 #but length will be populated. This means length(seq) 133 #will return None. 134 seqs = adaptor.execute_and_fetchall( 135 "SELECT alphabet, length, length(seq) FROM biosequence" \ 136 " WHERE bioentry_id = %s", (primary_id,)) 137 if not seqs : return 138 assert len(seqs) == 1 139 moltype, given_length, length = seqs[0] 140 141 try : 142 length = int(length) 143 given_length = int(length) 144 assert length == given_length 145 have_seq = True 146 except TypeError : 147 assert length is None 148 seqs = adaptor.execute_and_fetchall( 149 "SELECT alphabet, length, seq FROM biosequence" \ 150 " WHERE bioentry_id = %s", (primary_id,)) 151 assert len(seqs) == 1 152 moltype, given_length, seq = seqs[0] 153 assert seq is None or seq=="" 154 length = int(given_length) 155 have_seq = False 156 del seq 157 del given_length 158 159 moltype = moltype.lower() #might be upper case in database 160 #We have no way of knowing if these sequences will use IUPAC 161 #alphabets, and we certainly can't assume they are unambiguous! 162 if moltype == "dna": 163 alphabet = Alphabet.generic_dna 164 elif moltype == "rna": 165 alphabet = Alphabet.generic_rna 166 elif moltype == "protein": 167 alphabet = Alphabet.generic_protein 168 elif moltype == "unknown": 169 #This is used in BioSQL/Loader.py and would happen 170 #for any generic or nucleotide alphabets. 171 alphabet = Alphabet.single_letter_alphabet 172 else: 173 raise AssertionError("Unknown moltype: %s" % moltype) 174 175 if have_seq : 176 return DBSeq(primary_id, adaptor, alphabet, 0, int(length)) 177 else : 178 return UnknownSeq(length, alphabet)
179
180 -def _retrieve_dbxrefs(adaptor, primary_id):
181 """Retrieve the database cross references for the sequence.""" 182 _dbxrefs = [] 183 dbxrefs = adaptor.execute_and_fetchall( 184 "SELECT dbname, accession, version" \ 185 " FROM bioentry_dbxref join dbxref using (dbxref_id)" \ 186 " WHERE bioentry_id = %s" \ 187 " ORDER BY rank", (primary_id,)) 188 for dbname, accession, version in dbxrefs: 189 if version and version != "0": 190 v = "%s.%s" % (accession, version) 191 else: 192 v = accession 193 _dbxrefs.append("%s:%s" % (dbname, v)) 194 return _dbxrefs
195
196 -def _retrieve_features(adaptor, primary_id):
197 sql = "SELECT seqfeature_id, type.name, rank" \ 198 " FROM seqfeature join term type on (type_term_id = type.term_id)" \ 199 " WHERE bioentry_id = %s" \ 200 " ORDER BY rank" 201 results = adaptor.execute_and_fetchall(sql, (primary_id,)) 202 seq_feature_list = [] 203 for seqfeature_id, seqfeature_type, seqfeature_rank in results: 204 # Get qualifiers [except for db_xref which is stored separately] 205 qvs = adaptor.execute_and_fetchall( 206 "SELECT name, value" \ 207 " FROM seqfeature_qualifier_value join term using (term_id)" \ 208 " WHERE seqfeature_id = %s" \ 209 " ORDER BY rank", (seqfeature_id,)) 210 qualifiers = {} 211 for qv_name, qv_value in qvs: 212 qualifiers.setdefault(qv_name, []).append(qv_value) 213 # Get db_xrefs [special case of qualifiers] 214 qvs = adaptor.execute_and_fetchall( 215 "SELECT dbxref.dbname, dbxref.accession" \ 216 " FROM dbxref join seqfeature_dbxref using (dbxref_id)" \ 217 " WHERE seqfeature_dbxref.seqfeature_id = %s" \ 218 " ORDER BY rank", (seqfeature_id,)) 219 for qv_name, qv_value in qvs: 220 value = "%s:%s" % (qv_name, qv_value) 221 qualifiers.setdefault("db_xref", []).append(value) 222 # Get locations 223 results = adaptor.execute_and_fetchall( 224 "SELECT location_id, start_pos, end_pos, strand" \ 225 " FROM location" \ 226 " WHERE seqfeature_id = %s" \ 227 " ORDER BY rank", (seqfeature_id,)) 228 locations = [] 229 # convert to Python standard form 230 # Convert strand = 0 to strand = None 231 # re: comment in Loader.py: 232 # Biopython uses None when we don't know strand information but 233 # BioSQL requires something (non null) and sets this as zero 234 # So we'll use the strand or 0 if Biopython spits out None 235 for location_id, start, end, strand in results: 236 if start: 237 start -= 1 238 if strand == 0: 239 strand = None 240 locations.append( (location_id, start, end, strand) ) 241 # Get possible remote reference information 242 remote_results = adaptor.execute_and_fetchall( 243 "SELECT location_id, dbname, accession, version" \ 244 " FROM location join dbxref using (dbxref_id)" \ 245 " WHERE seqfeature_id = %s", (seqfeature_id,)) 246 lookup = {} 247 for location_id, dbname, accession, version in remote_results: 248 if version and version != "0": 249 v = "%s.%s" % (accession, version) 250 else: 251 v = accession 252 # subfeature remote location db_ref are stored as a empty string when 253 # not present 254 if dbname == "": 255 dbname = None 256 lookup[location_id] = (dbname, v) 257 258 feature = SeqFeature.SeqFeature(type = seqfeature_type) 259 feature._seqfeature_id = seqfeature_id #Store the key as a private property 260 feature.qualifiers = qualifiers 261 if len(locations) == 0: 262 pass 263 elif len(locations) == 1: 264 location_id, start, end, strand = locations[0] 265 #See Bug 2677, we currently don't record the location_operator 266 #For consistency with older versions Biopython, default to "". 267 feature.location_operator = \ 268 _retrieve_location_qualifier_value(adaptor, location_id) 269 dbname, version = lookup.get(location_id, (None, None)) 270 feature.location = SeqFeature.FeatureLocation(start, end) 271 feature.strand = strand 272 feature.ref_db = dbname 273 feature.ref = version 274 else: 275 assert feature.sub_features == [] 276 for location in locations: 277 location_id, start, end, strand = location 278 dbname, version = lookup.get(location_id, (None, None)) 279 subfeature = SeqFeature.SeqFeature() 280 subfeature.type = seqfeature_type 281 subfeature.location_operator = \ 282 _retrieve_location_qualifier_value(adaptor, location_id) 283 #TODO - See Bug 2677 - we don't yet record location_operator, 284 #so for consistency with older versions of Biopython default 285 #to assuming its a join. 286 if not subfeature.location_operator : 287 subfeature.location_operator="join" 288 subfeature.location = SeqFeature.FeatureLocation(start, end) 289 subfeature.strand = strand 290 subfeature.ref_db = dbname 291 subfeature.ref = version 292 feature.sub_features.append(subfeature) 293 # Assuming that the feature loc.op is the same as the sub_feature 294 # loc.op: 295 feature.location_operator = \ 296 feature.sub_features[0].location_operator 297 # Locations are in order, but because of remote locations for 298 # sub-features they are not necessarily in numerical order: 299 start = locations[0][1] 300 end = locations[-1][2] 301 feature.location = SeqFeature.FeatureLocation(start, end) 302 feature.strand = feature.sub_features[0].strand 303 304 seq_feature_list.append(feature) 305 306 return seq_feature_list
307
308 -def _retrieve_location_qualifier_value(adaptor, location_id):
309 value = adaptor.execute_and_fetch_col0( 310 "SELECT value FROM location_qualifier_value" \ 311 " WHERE location_id = %s", (location_id,)) 312 try: 313 return value[0] 314 except IndexError: 315 return ""
316
317 -def _retrieve_annotations(adaptor, primary_id, taxon_id):
318 annotations = {} 319 annotations.update(_retrieve_qualifier_value(adaptor, primary_id)) 320 annotations.update(_retrieve_reference(adaptor, primary_id)) 321 annotations.update(_retrieve_taxon(adaptor, primary_id, taxon_id)) 322 annotations.update(_retrieve_comment(adaptor, primary_id)) 323 return annotations
324
325 -def _retrieve_qualifier_value(adaptor, primary_id):
326 qvs = adaptor.execute_and_fetchall( 327 "SELECT name, value" \ 328 " FROM bioentry_qualifier_value JOIN term USING (term_id)" \ 329 " WHERE bioentry_id = %s" \ 330 " ORDER BY rank", (primary_id,)) 331 qualifiers = {} 332 for name, value in qvs: 333 if name == "keyword": name = "keywords" 334 elif name == "date_changed": name = "dates" 335 elif name == "secondary_accession": name = "accessions" 336 qualifiers.setdefault(name, []).append(value) 337 return qualifiers
338
339 -def _retrieve_reference(adaptor, primary_id):
340 # XXX dbxref_qualifier_value 341 342 refs = adaptor.execute_and_fetchall( 343 "SELECT start_pos, end_pos, " \ 344 " location, title, authors," \ 345 " dbname, accession" \ 346 " FROM bioentry_reference" \ 347 " JOIN reference USING (reference_id)" \ 348 " LEFT JOIN dbxref USING (dbxref_id)" \ 349 " WHERE bioentry_id = %s" \ 350 " ORDER BY rank", (primary_id,)) 351 references = [] 352 for start, end, location, title, authors, dbname, accession in refs: 353 reference = SeqFeature.Reference() 354 if start: start -= 1 355 reference.location = [SeqFeature.FeatureLocation(start, end)] 356 #Don't replace the default "" with None. 357 if authors : reference.authors = authors 358 if title : reference.title = title 359 reference.journal = location 360 if dbname == 'PUBMED': 361 reference.pubmed_id = accession 362 elif dbname == 'MEDLINE': 363 reference.medline_id = accession 364 references.append(reference) 365 if references : 366 return {'references': references} 367 else : 368 return {}
369
370 -def _retrieve_taxon(adaptor, primary_id, taxon_id):
371 a = {} 372 common_names = adaptor.execute_and_fetch_col0( 373 "SELECT name FROM taxon_name WHERE taxon_id = %s" \ 374 " AND name_class = 'genbank common name'", (taxon_id,)) 375 if common_names: 376 a['source'] = common_names[0] 377 scientific_names = adaptor.execute_and_fetch_col0( 378 "SELECT name FROM taxon_name WHERE taxon_id = %s" \ 379 " AND name_class = 'scientific name'", (taxon_id,)) 380 if scientific_names: 381 a['organism'] = scientific_names[0] 382 ncbi_taxids = adaptor.execute_and_fetch_col0( 383 "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,)) 384 if ncbi_taxids and ncbi_taxids[0] and ncbi_taxids[0] != "0": 385 a['ncbi_taxid'] = ncbi_taxids[0] 386 387 #Old code used the left/right values in the taxon table to get the 388 #taxonomy lineage in one SQL command. This was actually very slow, 389 #and would fail if the (optional) left/right values were missing. 390 # 391 #The following code is based on a contribution from Eric Gibert, and 392 #relies on the taxon table's parent_taxon_id field only (ignoring the 393 #optional left/right values). This means that it has to make a 394 #separate SQL query for each entry in the lineage, but it does still 395 #appear to be *much* faster. See Bug 2494. 396 taxonomy = [] 397 while taxon_id : 398 name, rank, parent_taxon_id = adaptor.execute_one( 399 "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id" \ 400 " FROM taxon, taxon_name" \ 401 " WHERE taxon.taxon_id=taxon_name.taxon_id" \ 402 " AND taxon_name.name_class='scientific name'" \ 403 " AND taxon.taxon_id = %s", (taxon_id,)) 404 if taxon_id == parent_taxon_id : 405 # If the taxon table has been populated by the BioSQL script 406 # load_ncbi_taxonomy.pl this is how top parent nodes are stored. 407 # Personally, I would have used a NULL parent_taxon_id here. 408 break 409 if rank != "no rank" : 410 #For consistency with older versions of Biopython, we are only 411 #interested in taxonomy entries with a stated rank. 412 #Add this to the start of the lineage list. 413 taxonomy.insert(0, name) 414 taxon_id = parent_taxon_id 415 416 if taxonomy: 417 a['taxonomy'] = taxonomy 418 return a
419
420 -def _retrieve_comment(adaptor, primary_id):
421 qvs = adaptor.execute_and_fetchall( 422 "SELECT comment_text FROM comment" \ 423 " WHERE bioentry_id=%s" \ 424 " ORDER BY rank", (primary_id,)) 425 comments = [comm[0] for comm in qvs] 426 #Don't want to add an empty list... 427 if comments : 428 return {"comment": comments} 429 else : 430 return {}
431
432 -class DBSeqRecord(SeqRecord):
433 """BioSQL equivalent of the biopython SeqRecord object. 434 """ 435
436 - def __init__(self, adaptor, primary_id):
437 self._adaptor = adaptor 438 self._primary_id = primary_id 439 440 (self._biodatabase_id, self._taxon_id, self.name, 441 accession, version, self._identifier, 442 self._division, self.description) = self._adaptor.execute_one( 443 "SELECT biodatabase_id, taxon_id, name, accession, version," \ 444 " identifier, division, description" \ 445 " FROM bioentry" \ 446 " WHERE bioentry_id = %s", (self._primary_id,)) 447 if version and version != "0": 448 self.id = "%s.%s" % (accession, version) 449 else: 450 self.id = accession
451
452 - def __get_seq(self):
453 if not hasattr(self, "_seq"): 454 self._seq = _retrieve_seq(self._adaptor, self._primary_id) 455 return self._seq
456 - def __set_seq(self, seq): self._seq = seq
457 - def __del_seq(self): del self._seq
458 seq = property(__get_seq, __set_seq, __del_seq, "Seq object") 459
460 - def __get_dbxrefs(self):
461 if not hasattr(self,"_dbxrefs"): 462 self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id) 463 return self._dbxrefs
464 - def __set_dbxrefs(self, dbxrefs): self._dbxrefs = dbxrefs
465 - def __del_dbxrefs(self): del self._dbxrefs
466 dbxrefs = property(__get_dbxrefs, __set_dbxrefs, __del_dbxrefs, 467 "Database cross references") 468
469 - def __get_features(self):
470 if not hasattr(self, "_features"): 471 self._features = _retrieve_features(self._adaptor, 472 self._primary_id) 473 return self._features
474 - def __set_features(self, features): self._features = features
475 - def __del_features(self): del self._features
476 features = property(__get_features, __set_features, __del_features, 477 "Features") 478
479 - def __get_annotations(self):
480 if not hasattr(self, "_annotations"): 481 self._annotations = _retrieve_annotations(self._adaptor, 482 self._primary_id, 483 self._taxon_id) 484 if self._identifier: 485 self._annotations["gi"] = self._identifier 486 if self._division: 487 self._annotations["data_file_division"] = self._division 488 return self._annotations
489 - def __set_annotations(self, annotations): self._annotations = annotations
490 - def __del_annotations(self): del self._annotations
491 annotations = property(__get_annotations, __set_annotations, 492 __del_annotations, "Annotations")
493