Package BioSQL :: Module BioSeqDatabase
[hide private]
[frames | no frames]

Source Code for Module BioSQL.BioSeqDatabase

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2008 by Peter Cock. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  # 
  7  # Note that BioSQL (including the database schema and scripts) is 
  8  # available and licensed separately.  Please consult www.biosql.org 
  9  """Connect with a BioSQL database and load Biopython like objects from it. 
 10   
 11  This provides interfaces for loading biological objects from a relational 
 12  database, and is compatible with the BioSQL standards. 
 13  """ 
 14  import BioSeq 
 15  import Loader 
 16  import DBUtils 
 17   
def open_database(driver="MySQLdb", **kwargs):
    """Main interface for loading an existing BioSQL-style database.

    This function is the easiest way to retrieve a connection to a
    database, doing something like:

        >>> from BioSQL import BioSeqDatabase
        >>> server = BioSeqDatabase.open_database(user = "root", db="minidb")

    the various options are:
    driver -> The name of the database driver to use for connecting. The
    driver should implement the python DB API. By default, the MySQLdb
    driver is used.
    user -> the username to connect to the database with.
    password, passwd -> the password to connect with
    host -> the hostname of the database
    database or db -> the name of the database

    Any other keyword is passed through to the driver's connect() call
    unchanged.  Returns a DBServer wrapping the new connection.
    """
    module = __import__(driver)
    connect = getattr(module, "connect")

    # Different drivers use different keywords, so normalise the two
    # spellings of the database name and password on a private copy.
    kw = kwargs.copy()
    if driver == "MySQLdb":
        if "database" in kw:
            kw["db"] = kw.pop("database")
        if "password" in kw:
            kw["passwd"] = kw.pop("password")
    else:
        # DB-API recommendations
        if "db" in kw:
            kw["database"] = kw.pop("db")
        if "passwd" in kw:
            kw["password"] = kw.pop("passwd")
    if driver in ["psycopg", "psycopg2"] and not kw.get("database"):
        kw["database"] = "template1"
    try:
        conn = connect(**kw)
    except module.InterfaceError:
        # Ok, so let's try building a DSN
        # (older releases of psycopg need this)
        if "database" in kw:
            kw["dbname"] = kw.pop("database")
        elif "db" in kw:
            kw["dbname"] = kw.pop("db")

        # Format each pair explicitly: option values such as a port number
        # need not be strings, and str.join() would reject them.
        dsn = ' '.join(["%s=%s" % (key, value) for key, value in kw.items()])
        conn = connect(dsn)

    return DBServer(conn, module)
74
class DBServer:
    """A BioSQL server: one DB-API connection plus the driver that made it.

    Sub-databases (rows of the biodatabase table) are reached with
    dictionary-style access, e.g. server["mydb"], and listed with keys().
    """

    def __init__(self, conn, module, module_name=None):
        """Wrap an open connection.

        conn -> an open DB-API connection.
        module -> the driver module the connection came from.
        module_name -> name used to select driver specific helpers;
        defaults to module.__name__.
        """
        self.module = module
        if module_name is None:
            module_name = module.__name__
        self.adaptor = Adaptor(conn, DBUtils.get_dbutils(module_name))
        self.module_name = module_name

    def __repr__(self):
        # Explicit one-tuple so a tuple-valued conn cannot be mistaken for
        # the format arguments.
        return "%s(%r)" % (self.__class__.__name__, self.adaptor.conn)

    def __getitem__(self, name):
        """Return the BioSeqDatabase called name."""
        return BioSeqDatabase(self.adaptor, name)

    def keys(self):
        """List the names of the sub-databases on this server."""
        return self.adaptor.list_biodatabase_names()

    def values(self):
        """List a BioSeqDatabase object for every sub-database."""
        return [self[key] for key in self.keys()]

    def items(self):
        """List (name, BioSeqDatabase) pairs for every sub-database."""
        return [(key, self[key]) for key in self.keys()]

    def remove_database(self, db_name):
        """Try to remove all references to items in a database.

        Raises KeyError (from the adaptor) if db_name does not exist.
        """
        db_id = self.adaptor.fetch_dbid_by_dbname(db_name)
        remover = Loader.DatabaseRemover(self.adaptor, db_id)
        remover.remove()

    def new_database(self, db_name, authority=None, description=None):
        """Add a new database to the server and return it."""
        # make the database
        sql = r"INSERT INTO biodatabase (name, authority, description)" \
              r" VALUES (%s, %s, %s)"
        self.adaptor.execute(sql, (db_name, authority, description))
        return BioSeqDatabase(self.adaptor, db_name)

    def load_database_sql(self, sql_file):
        """Load a database schema into the given database.

        This is used to create tables, etc when a database is first created.
        sql_file should specify the complete path to a file containing
        SQL entries for building the tables.

        Raises ValueError if the driver is not one the loader knows how to
        feed (psycopg, psycopg2 or MySQLdb).
        """
        # Not sophisticated enough for PG schema. Is it needed by MySQL?
        # Looks like we need this more complicated way for both. Leaving it
        # the default and removing the simple-minded approach.

        # Read the file with all comment lines removed.  The handle is
        # always closed, even if reading fails part way through.
        kept = []
        sql_handle = open(sql_file, "r")
        try:
            for line in sql_handle:
                if line.startswith("--"):    # don't include comment lines
                    continue
                if line.startswith("#"):     # ditto for MySQL comments
                    continue
                line = line.strip()
                if line:                     # only include non-blank lines
                    kept.append(line)
        finally:
            sql_handle.close()
        # Every kept line is followed by one space, so statements that span
        # several lines stay separated once they are joined.
        sql = "".join([line + " " for line in kept])

        # two ways to load the SQL
        # 1. PostgreSQL can load it all at once and actually needs to
        # due to FUNCTION defines at the end of the SQL which mess up
        # the splitting by semicolons
        if self.module_name in ["psycopg", "psycopg2"]:
            self.adaptor.cursor.execute(sql)
        # 2. MySQL needs the database loading split up into single lines of
        # SQL executed one at a time
        elif self.module_name in ["MySQLdb"]:
            for sql_line in sql.split(";"):  # one piece per sql command
                # Skip empty pieces (normally just the text after the final
                # semicolon) instead of blindly dropping the last piece, so
                # an unterminated final statement is still executed.
                if sql_line.strip():
                    self.adaptor.cursor.execute(sql_line)
        else:
            raise ValueError("Module %s not supported by the loader." %
                             (self.module_name))

    def commit(self):
        """Commits the current transaction to the database."""
        return self.adaptor.commit()

    def rollback(self):
        """Rolls backs the current transaction."""
        return self.adaptor.rollback()

    def close(self):
        """Close the connection. No further activity possible."""
        return self.adaptor.close()
160
class Adaptor:
    """Thin layer over a DB-API connection that runs the BioSQL queries.

    It keeps the connection, a single cursor opened on it, and a dbutils
    object that supplies the driver specific operations (last_id and
    autocommit).
    """

    def __init__(self, conn, dbutils):
        self.conn = conn
        self.cursor = conn.cursor()
        self.dbutils = dbutils

    def last_id(self, table):
        """Return whatever dbutils.last_id reports for table."""
        return self.dbutils.last_id(self.cursor, table)

    def autocommit(self, y=True):
        """Set the autocommit mode. True values enable; False value disable."""
        return self.dbutils.autocommit(self.conn, y)

    def commit(self):
        """Commits the current transaction."""
        return self.conn.commit()

    def rollback(self):
        """Rolls backs the current transaction."""
        return self.conn.rollback()

    def close(self):
        """Close the connection. No further activity possible."""
        return self.conn.close()

    def _fetch_rows_in_db(self, sql, fields, dbid):
        """Run a bioentry query, limited to one biodatabase if dbid is true.

        fields is extended in place when the filter is added.  Returns all
        the rows fetched.
        """
        if dbid:
            sql += " and biodatabase_id = %s"
            fields.append(dbid)
        self.cursor.execute(sql, fields)
        return self.cursor.fetchall()

    def fetch_dbid_by_dbname(self, dbname):
        """Return the biodatabase_id for dbname, or raise KeyError."""
        self.cursor.execute(
            r"select biodatabase_id from biodatabase where name = %s",
            (dbname,))
        rv = self.cursor.fetchall()
        if not rv:
            raise KeyError("Cannot find biodatabase with name %r" % dbname)
        # More than one row "cannot happen (UK)" according to the original
        # author -- presumably name is a unique key -- so it is not checked.
        return rv[0][0]

    def fetch_seqid_by_display_id(self, dbid, name):
        """Return the bioentry_id whose name column equals name.

        Raises IndexError if there is no match or more than one.
        """
        sql = r"select bioentry_id from bioentry where name = %s"
        rv = self._fetch_rows_in_db(sql, [name], dbid)
        if not rv:
            raise IndexError("Cannot find display id %r" % name)
        if len(rv) > 1:
            raise IndexError("More than one entry with display id %r" % name)
        return rv[0][0]

    def fetch_seqid_by_accession(self, dbid, name):
        """Return the bioentry_id whose accession equals name.

        Raises IndexError if there is no match or more than one.
        """
        sql = r"select bioentry_id from bioentry where accession = %s"
        rv = self._fetch_rows_in_db(sql, [name], dbid)
        if not rv:
            raise IndexError("Cannot find accession %r" % name)
        if len(rv) > 1:
            raise IndexError("More than one entry with accession %r" % name)
        return rv[0][0]

    def fetch_seqids_by_accession(self, dbid, name):
        """Return a list of every bioentry_id whose accession equals name."""
        sql = r"select bioentry_id from bioentry where accession = %s"
        return [row[0] for row in self._fetch_rows_in_db(sql, [name], dbid)]

    def fetch_seqid_by_version(self, dbid, name):
        """Return the bioentry_id for an "accession.version" string.

        A name without a dot is looked up as version "0".  Raises
        IndexError for a name with more than one dot, for no match, or
        for more than one match.
        """
        acc_version = name.split(".")
        if len(acc_version) > 2:
            raise IndexError("Bad version %r" % name)
        acc = acc_version[0]
        if len(acc_version) == 2:
            version = acc_version[1]
        else:
            version = "0"
        sql = r"SELECT bioentry_id FROM bioentry WHERE accession = %s" \
              r" AND version = %s"
        rv = self._fetch_rows_in_db(sql, [acc, version], dbid)
        if not rv:
            raise IndexError("Cannot find version %r" % name)
        if len(rv) > 1:
            raise IndexError("More than one entry with version %r" % name)
        return rv[0][0]

    def fetch_seqid_by_identifier(self, dbid, identifier):
        """Return the bioentry_id whose identifier column equals identifier.

        Raises IndexError if there is no match.  Unlike the other
        single-id lookups, several matches are not an error here: the
        first row returned is used.
        """
        # YB: was fetch_seqid_by_seqid
        sql = "SELECT bioentry_id FROM bioentry WHERE identifier = %s"
        rv = self._fetch_rows_in_db(sql, [identifier], dbid)
        if not rv:
            raise IndexError("Cannot find identifier %r" % identifier)
        return rv[0][0]

    def list_biodatabase_names(self):
        """Return the name of every row in the biodatabase table."""
        return self.execute_and_fetch_col0(
            "SELECT name FROM biodatabase")

    def list_bioentry_ids(self, dbid):
        """Return the bioentry_id of every entry in biodatabase dbid."""
        return self.execute_and_fetch_col0(
            "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s",
            (dbid,))

    def list_bioentry_display_ids(self, dbid):
        """Return the name of every entry in biodatabase dbid."""
        return self.execute_and_fetch_col0(
            "SELECT name FROM bioentry WHERE biodatabase_id = %s",
            (dbid,))

    def list_any_ids(self, sql, args):
        """Return ids given a SQL statement to select for them.

        This assumes that the given SQL does a SELECT statement that
        returns a list of items. This parses them out of the 2D list
        they come as and just returns them in a list.
        """
        # execute_and_fetch_col0 is a method of this class, not of the
        # DB-API cursor.
        return self.execute_and_fetch_col0(sql, args)

    def execute_one(self, sql, args=None):
        """Execute sql and return its single result row.

        Asserts that exactly one row came back.
        """
        self.cursor.execute(sql, args or ())
        rv = self.cursor.fetchall()
        assert len(rv) == 1, "Expected 1 response, got %d" % len(rv)
        return rv[0]

    def execute(self, sql, args=None):
        """Just execute an sql command."""
        self.cursor.execute(sql, args or ())

    def get_subseq_as_string(self, seqid, start, end):
        """Return the slice start:end of the sequence stored for seqid.

        start and end are handled as a zero-based, end-exclusive pair; the
        query itself uses SQL SUBSTRING, hence the start+1 offset.
        """
        length = end - start
        return self.execute_one(
            "select SUBSTRING(seq FROM %s FOR %s)"
            " from biosequence where bioentry_id = %s",
            (start + 1, length, seqid))[0]

    def execute_and_fetch_col0(self, sql, args=None):
        """Execute sql and return the first column of every row as a list."""
        self.cursor.execute(sql, args or ())
        return [field[0] for field in self.cursor.fetchall()]

    def execute_and_fetchall(self, sql, args=None):
        """Execute sql and return all the rows it produced."""
        self.cursor.execute(sql, args or ())
        return self.cursor.fetchall()
317 318 _allowed_lookups = { 319 # Lookup name / function name to get id, function to list all ids 320 'primary_id': "fetch_seqid_by_identifier", 321 'gi': "fetch_seqid_by_identifier", 322 'display_id': "fetch_seqid_by_display_id", 323 'name': "fetch_seqid_by_display_id", 324 'accession': "fetch_seqid_by_accession", 325 'version': "fetch_seqid_by_version", 326 } 327
class BioSeqDatabase:
    """One named sub-database (a biodatabase row) on a BioSQL server.

    Records are returned as BioSeq.DBSeqRecord objects.  Dictionary-style
    access (db[key], keys(), values(), items()) works on the internal
    primary ids.
    """

    def __init__(self, adaptor, name):
        """Look up the database called name; KeyError if it is missing."""
        self.adaptor = adaptor
        self.name = name
        self.dbid = self.adaptor.fetch_dbid_by_dbname(name)

    def __repr__(self):
        return "BioSeqDatabase(%r, %r)" % (self.adaptor, self.name)

    def get_Seq_by_id(self, name):
        """Gets a record by its name (display id).

        Example: seq = db.get_Seq_by_id('ROA1_HUMAN')

        """
        seqid = self.adaptor.fetch_seqid_by_display_id(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_acc(self, name):
        """Gets a record by accession number.

        Example: seq = db.get_Seq_by_acc('X77802')

        """
        seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_ver(self, name):
        """Gets a record by version number (accession.version).

        Example: seq = db.get_Seq_by_ver('X77802.1')

        """
        seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seqs_by_acc(self, name):
        """Gets a *list* of records by accession number.

        Example: seqs = db.get_Seqs_by_acc('X77802')

        """
        seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name)
        return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids]

    def get_PrimarySeq_stream(self):
        """Not implemented; always raises NotImplementedError."""
        # my @array = $self->get_all_primary_ids;
        # my $stream = Bio::DB::BioDatabasePSeqStream->new(
        #         -adaptor => $self->_adaptor->db->get_PrimarySeqAdaptor,
        #         -idlist => \@array);
        raise NotImplementedError("waiting for Python 2.2's iter")

    def get_all_primary_ids(self):
        """Array of all the primary_ids of the sequences in the database.

        These maybe ids (display style) or accession numbers or
        something else completely different - they *are not*
        meaningful outside of this database implementation.
        """
        return self.adaptor.list_bioentry_ids(self.dbid)

    def __getitem__(self, key):
        """Return the record whose primary (internal) id is key."""
        return BioSeq.DBSeqRecord(self.adaptor, key)

    def keys(self):
        """List the primary ids of all records in this database."""
        return self.get_all_primary_ids()

    def values(self):
        """List a record object for every primary id."""
        return [self[key] for key in self.keys()]

    def items(self):
        """List (primary id, record) pairs for the whole database."""
        return [(key, self[key]) for key in self.keys()]

    def lookup(self, **kwargs):
        """Return the record matching a single keyword=value criterion.

        Exactly one keyword must be given and it must be one of the names
        in _allowed_lookups, e.g. db.lookup(accession='X77802').  Raises
        TypeError otherwise; a failed search raises whatever the selected
        adaptor method raises.
        """
        if len(kwargs) != 1:
            raise TypeError("single key/value parameter expected")
        # list() so this works whether items() returns a list or a view.
        k, v = list(kwargs.items())[0]
        if k not in _allowed_lookups:
            names = list(_allowed_lookups.keys())
            names.sort()
            raise TypeError("lookup() expects one of %s, not %r" %
                            (repr(names)[1:-1], k))
        lookup_name = _allowed_lookups[k]
        lookup_func = getattr(self.adaptor, lookup_name)
        seqid = lookup_func(self.dbid, v)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_primary_id(self, seqid):
        """Gets a record by the primary (internal) id.

        The primary id in these cases has to come from
        get_all_primary_ids(). There is no other way to get (or
        guess) the primary_ids in a database.
        """
        return self[seqid]

    def load(self, record_iterator, fetch_NCBI_taxonomy=False):
        """Load a set of SeqRecords into the BioSQL database.

        record_iterator is either a list of SeqRecord objects, or an
        Iterator object that returns SeqRecord objects (such as the
        output from the Bio.SeqIO.parse() function), which will be
        used to populate the database.

        fetch_NCBI_taxonomy is boolean flag allowing or preventing
        connection to the taxonomic database on the NCBI server
        (via Bio.Entrez) to fetch a detailed taxonomy for each
        SeqRecord.

        Example:
        from Bio import SeqIO
        count = db.load(SeqIO.parse(open(filename), format))

        Returns the number of records loaded.
        """
        db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid,
                                          fetch_NCBI_taxonomy)
        num_records = 0
        for cur_record in record_iterator:
            num_records += 1
            db_loader.load_seqrecord(cur_record)
        return num_records
444