1
2
3
4
5
6
7
8
9 """Implements Registry to access databases. These objects access
10 databases using a dictionary-like interface, where the key is the ID
11 of the thing to look up, and the value returned is the data associated
12 with the key.
13
14 Classes:
15 DBRegistry Accesses databases with a dictionary-like interface.
16 DBObject Base class for Registry objects for databases.
17 DBGroup Groups DBObjects.
18
19 CGIDB Accesses CGI databases.
20 EUtilsDB Accesses NCBI using EUtils.
21 BioSQLDB Accesses a BioSQL database.
22 BioCorbaDB Accesses a BioCorba database.
23 IndexedFileDB Accesses a Mindy Indexed file.
24 """
25 from Bio.config.Registry import *
26
28 """This implements a dictionary-like interface to databases.
29
30 """
31 - def __init__(self, name, load_path=None):
33
34
35 db = DBRegistry("db", "Bio.dbdefs")
36
38 return abbrev.replace("-", "_")
39
41 """This is a base class for dictionary-like interfaces to
42 databases.
43
44 Methods:
45 get Lookup a key in a database, with a default value.
46 get_as Lookup a key and convert to an object.
47 __getitem__ Lookup a key in a database.
48
49 THE FOLLOWING SHOULD BE IMPLEMENTED IN A DERIVED CLASS.
50 _get Return the data indicated by key.
51 _convert_to Convert the data to another object.
52 IMPLEMENT THESE ONLY IF TIMEOUT OR CONCURRENT ACCESS IS NEEDED.
53 _make_pickleable Make the object returned by _get to a pickleable.
54 _unmake_pickleable Turn the pickleable object back into the original
55
56 """
57 - def __init__(self, name, abbrev=None, doc=None, delay=None):
65
66 - def set(self, key, data):
68
def get(self, key, default=None):
    """S.get(key[, default]) -> data

    Look the key up through item access on this object.  When that
    lookup raises KeyError, default is handed back instead.
    """
    try:
        return self[key]
    except KeyError:
        return default
76
77 - def get_as(self, key, to_io=None, default=None):
81
83 try:
84 return self._get(key)
85 except IOError, x:
86 if str(x) == "timed out":
87 raise KeyError, x
88 raise
89
90
91
def _get(self, key):
    """S._get(key) -> data

    Hook that a derived class replaces with the real retrieval of the
    data for key.  This base version only raises NotImplementedError.
    """
    # Call form of raise: the "raise E, msg" statement form is accepted
    # by Python 2 only, while this spelling behaves the same everywhere.
    raise NotImplementedError("Please implement in a derived class.")
97 """S._convert_to(data, to_io) -> another data type"""
98
99
def _set(self, key, data):
    """S._set(key, data)

    Hook for storing data under key.  This base version does not store
    anything; it only raises NotImplementedError.
    """
    # Call form of raise: the "raise E, msg" statement form is accepted
    # by Python 2 only, while this spelling behaves the same everywhere.
    raise NotImplementedError("Caching not supported here.")
105 """S._make_pickleable(key, data) -> pickleable_obj"""
106
107
108
109 raise NotImplementedError, "pickling not supported."
111 """S._unmake_pickleable(key, pickleable_obj) -> data"""
112
113
114
115 raise NotImplementedError, "pickling not supported."
116
118 """Groups DBObjects that return the same kind of data.
119
120 """
def __init__(self, name, abbrev=None, doc=None, cache=None):
    """DBGroup(name[, abbrev][, doc][, cache])

    name is the name of the group and abbrev is an abbreviation for
    it; when abbrev is empty, name is used in its place.  The
    abbreviation is passed through _clean_abbrev before registration.
    doc is handed to RegisterableGroup unchanged.  cache is accepted
    but is not used by this constructor.
    """
    if not abbrev:
        abbrev = name
    RegisterableGroup.__init__(self, name, _clean_abbrev(abbrev), doc)
    # No member object has answered a lookup yet.
    self._last_object_used = None
130
132 for obj in self.objs:
133 try:
134 handle = obj[key]
135 except SystemError, KeyboardInterrupt:
136 raise
137 except Exception, x:
138 continue
139 else:
140 self._last_object_used = obj
141 return handle
142 raise KeyError, "I could not get any results."
143
144 - def get(self, key, default=None):
150
151 - def get_as(self, key, to_io=None, default=None):
155
157 """Mixin class with useful functionality for retrieval of text files.
158
159 This implements some useful helper functions and overrides of DBObject
160 for those implementations which need to retrieve text, check for errors in
161 the retrieve text, and then convert that text to other formats.
162 """
def _check_for_errors(self, handle, failure_cases):
    """Scan retrieved text for known error patterns.

    handle is the file-like object holding the retrieved text.
    failure_cases is a sequence of (expression, error message) pairs;
    each expression must provide make_parser().

    Returns handle untouched when failure_cases is empty or None.
    Otherwise returns a ReseekFile wrapper around it, positioned back
    where it started, so the caller can still read the full text.

    Raises KeyError(error message) for the first expression whose
    parser reports the text as recognized.
    """
    if not failure_cases:
        return handle

    # Imported only once there is something to check, so the common
    # "no failure cases" path does not need Martel to be importable.
    from Martel import Parser
    from Bio import StdHandler
    from Bio.EUtils.ReseekFile import ReseekFile

    handle = ReseekFile(handle)
    pos = handle.tell()
    for expression, errormsg in failure_cases:
        # Every pattern is tried against the same starting position.
        handle.seek(pos)
        parser = expression.make_parser()
        handler = StdHandler.RecognizeHandler()
        parser.setContentHandler(handler)
        parser.setErrorHandler(handler)
        try:
            parser.parseFile(handle)
        except Parser.ParserException:
            # A parse failure is not itself an error here; only
            # handler.recognized below decides the outcome.
            pass
        if handler.recognized:
            raise KeyError(errormsg)
    handle.seek(pos)
    return handle
186
def _convert_to(self, handle, to_io):
    """Convert the text in handle into a single object using to_io.

    to_io.read(handle) is called once.  If the result is not a
    FormatIO.FormatIOIterator it is returned as is.  If it is an
    iterator, it must yield exactly one record, which is returned.

    Raises AssertionError when the iterator yields more than one
    record, and KeyError when it yields none (previously this case
    failed with an UnboundLocalError on the result variable).
    """
    from Bio import FormatIO
    x = to_io.read(handle)
    if not isinstance(x, FormatIO.FormatIOIterator):
        return x

    found = False
    rec = None
    for item in x:
        if found:
            raise AssertionError("Multiple records returned")
        rec = item
        found = True
    if not found:
        # KeyError is what the other lookup failures in this file
        # raise, so an empty result reads as "nothing for this key".
        raise KeyError("No records returned")
    return rec
199
200 -class CGIDB(DBObject, TextLikeMixin):
201 """This class implements DBObject for accessing CGI databases.
202
203 """
def __init__(self, name, cgi, url=None, key=None, params=None,
             abbrev=None, doc=None, delay=None, timeout=None,
             getmethod=1, failure_cases=None):
    """CGIDB(name, cgi[, url][, key][, params][, abbrev][, doc]
    [, delay][, timeout][, getmethod][, failure_cases])

    name is the name of the object, abbrev is an abbreviation for
    the name, and doc is some documentation describing the object.

    cgi is the URL for the cgi script.  url points to the
    human-readable URL of the form.

    params is a list of (key, value) tuples indicating the
    parameters that should be passed to the CGI script.  key is
    the name of the parameter for the CGI script whose value is
    the ID of the object to retrieve.

    delay is handed to DBObject.__init__.  timeout is kept on the
    instance as self.timeout; nothing in this constructor enforces it.

    getmethod is a boolean describing whether a GET or POST should
    be used.  By default, GET is used.

    failure_cases is a list of (Martel Expression, error message)
    describing patterns of errors in the text returned by the
    script.

    """
    import _support
    # DBObject.__init__ accepts (name, abbrev, doc, delay) only.
    # Forwarding timeout= made every construction fail with a
    # TypeError, so it is stored here instead of being passed up.
    DBObject.__init__(self, name=name, abbrev=abbrev,
                      doc=doc, delay=delay)
    self.timeout = timeout
    self.cgi = cgi
    self.key = key or ''
    self.params = params or []
    self.url = url
    self.getmethod = getmethod
    # Each expression goes through _support.make_cached_expression.
    self.failure_cases = [
        (_support.make_cached_expression(exp), message)
        for exp, message in failure_cases or []]
241
243 return self.params + [(self.key, key)]
244
245 - def _get(self, key):
249
251 import urllib
252 params = self._normalize_params(key)
253 options = _my_urlencode(params)
254 if self.getmethod:
255 fullcgi = self.cgi
256 if options:
257 fullcgi = "%s?%s" % (self.cgi, options)
258 handle = urllib.urlopen(fullcgi)
259 else:
260 handle = urllib.urlopen(self.cgi, options)
261 return handle
262
265
267 import StringIO
268 return StringIO.StringIO(obj)
269
270 -class EUtilsDB(DBObject, TextLikeMixin):
271 """Implement DBObject for accessing EUtils databases at NCBI.
272 """
def __init__(self, name, db, rettype, abbrev = None, doc = None,
             failure_cases = None, delay = None, timeout = None):
    """Initialize an EUtilsDB connection for retrieval.

    name is the name of the object, abbrev is an abbreviation for
    the name, and doc is some documentation describing the object.

    db is the name of the database at NCBI you want to retrieve from
    (ie. protein, nucleotide, pubmed)

    rettype is the type of information to return
    (ie. gp, gb, fasta, medline)

    failure_cases is a list of (Martel Expression, error message)
    describing patterns of errors in the text returned by the
    script.

    delay is handed to DBObject.__init__.  timeout is kept on the
    instance as self.timeout; nothing in this constructor enforces it.
    """
    import _support
    # DBObject.__init__ accepts (name, abbrev, doc, delay) only.
    # Forwarding timeout= made every construction fail with a
    # TypeError, so it is stored here instead of being passed up.
    DBObject.__init__(self, name=name, abbrev=abbrev,
                      doc=doc, delay=delay)
    self.timeout = timeout
    self.db = db
    self.rettype = rettype
    # Each expression goes through _support.make_cached_expression.
    self.failure_cases = [
        (_support.make_cached_expression(exp), message)
        for exp, message in failure_cases or []]
299
300 - def _get(self, key):
311
313 """Represent a BioSQL-style database to retrieve SeqRecord objects.
314
315 This returns a SeqRecord-like object from _get() instead of a
316 handle (since BioSQL is not going to give you a handle).
317
318 """
def __init__(self, name, doc = "", db_host = 'localhost', db_port = '',
             db_user = 'root', db_passwd = '', sql_db = '',
             namespace_db = '', db_type = 'mysql'):
    """Record the settings needed to reach a BioSQL database.

    name and doc are handed to DBObject.__init__.  Every other
    argument is stored unchanged on the instance under the same name;
    no connection is opened here.
    """
    DBObject.__init__(self, name=name, doc=doc)
    # Where the server is and how to log in.
    self.db_host, self.db_port = db_host, db_port
    self.db_user, self.db_passwd = db_user, db_passwd
    # Which database and namespace, and which kind of server.
    self.sql_db, self.namespace_db = sql_db, namespace_db
    self.db_type = db_type
332
334 """Retrieve the appropriate module to use for connecting to a database
335
336 This parses a description of the database and tries to determine
337 which module is appropriate for that database type.
338 """
339 if db_type in ['mysql']:
340 return 'MySQLdb'
341 elif db_type in ['pg', 'postgres', 'postgresql']:
342 raise ValueError("Postgres not supported yet. Sorry.")
343 else:
344 raise ValueError("Unknown database type: %s" % db_type)
345
def _get(self, key):
    """Fetch the record for key from the configured BioSQL database.

    A connection is opened with BioSeqDatabase.open_database using the
    settings stored on the instance (the port is passed only when one
    is set), and the sub-database named by self.namespace_db is
    selected.  key is then tried as an "accession" and, failing that,
    as a "display_id".

    Returns the first lookup result that is not None.  Raises KeyError
    when neither lookup produces a record.
    """
    from BioSQL import BioSeqDatabase

    db_driver = self._get_db_module(self.db_type)
    open_args = {"user" : self.db_user,
                 "passwd" : self.db_passwd,
                 "host" : self.db_host,
                 "db" : self.sql_db,
                 "driver" : db_driver}
    if self.db_port:
        open_args["port"] = self.db_port
    server = BioSeqDatabase.open_database(**open_args)
    db = server[self.namespace_db]

    for id_type in ("accession", "display_id"):
        try:
            item = db.lookup(**{id_type : key})
        except IndexError:
            # No record under this kind of identifier; try the next.
            continue
        if item is not None:
            # Stop at the first hit.  The old loop kept going, which
            # cost a second query and let a later lookup overwrite
            # (or blank out) a record that had already been found.
            return item
    raise KeyError("Could not get item with id: %s" % (key,))
376
382
387
389 """Represent a BioCorba BioSequenceCollection for SeqRecord objects.
390
391 Returns SeqRecord-like objects.
392
393 """
def __init__(self, name, ior_ref, server_type=None, doc=""):
    """Set up access to a BioCorba collection from its IOR reference.

    ior_ref is a URL or file reference to an IOR string.  The IOR
    should reference a BioSequenceCollection, the top level BioCorba
    object used for making objects available.

    server_type is a hack parameter which might be necessary when
    there are server/client issues (ie. as with Perl ORBit).  It is
    handed to self._get_retriever to pick the retriever; left as
    None, that call decides what to use.

    The connection itself is not made here: self.corba_dict starts
    out as None.
    """
    DBObject.__init__(self, name=name, doc=doc)
    retriever = self._get_retriever(server_type)
    self.retriever = retriever
    self.ior_ref = ior_ref
    # Placeholder until a connection object is stored here.
    self.corba_dict = None
409
411 """Return a BioCorba retriever object based on the specified server.
412
413 This returns a ready-to-go client retriever which can be used to
414 connect to a BioCorba server.
415 """
416
417
418 from BioCorba.Client.BiocorbaConnect import PerlCorbaClient, \
419 PythonCorbaClient, JavaCorbaClient, GenericCorbaClient
420 from BioCorba.Client.Seqcore.CorbaCollection import \
421 BioSequenceCollection
422
423 if server_type is None:
424 client_type = GenericCorbaClient
425 else:
426 server_type = server_type.lower()
427 if server_type.find("python") >= 0:
428 client_type = PythonCorbaClient
429 elif server_type.find("java") >= 0:
430 client_type = JavaCorbaClient
431 elif server_type.find("perl") >= 0:
432 client_type = PerlCorbaClient
433 else:
434 raise ValueError("Unexpected server type specified: %s" %
435 server_type)
436
437 retriever = client_type(BioSequenceCollection)
438 return retriever
439
441 """Get a connection to the CORBA server based on the ior_ref
442 """
443
444 from BioCorba.Bio import GenBank
445
446 if ior_ref.find("http") >= 0:
447 client = retriever.from_url_ior(ior_ref)
448 else:
449 client = retriever.from_file_ior(ior_ref)
450
451 return GenBank.Dictionary(client, GenBank.FeatureParser())
452
def _get(self, key):
    """Return the entry for key from the CORBA-backed dictionary.

    The dictionary is created on first use by calling
    self._get_corba_client(self.ior_ref, self.retriever) and is kept
    in self.corba_dict for later calls.
    """
    connection = self.corba_dict
    if connection is None:
        connection = self._get_corba_client(self.ior_ref,
                                            self.retriever)
        self.corba_dict = connection
    return connection[key]
459
465
467 """Return SeqRecord objects from an indexed file.
468
469 This module deals with both flat file and BerkeleyDB indexes.
470 These indexed files can be created by any of the compliant indexing
471 implementations from Biopython, BioPerl, BioJava, etc...
472
473 """
def __init__(self, name, dbname, doc = ""):
    """Open the indexed database and keep it on the instance.

    name and doc are handed to DBObject.__init__.  dbname is the name
    of the database to open; this will likely be a filesystem path to
    a database directory.  It is passed to self._load_database and
    the result is stored as self.db.
    """
    DBObject.__init__(self, name=name, doc=doc)
    opened = self._load_database(dbname)
    self.db = opened
482
484 """Get a connection with the given database.
485 """
486 from Bio import Mindy
487 db = Mindy.open(dbname = name)
488 return db
489
491 """Get a list of all namespaces to search for the file under.
492
493 If given_name is a valid key, then it is returned as the only
494 thing to check. Otherwise, we go forward and check all possible
495 namespaces.
496 """
497 if given_name is not None and given_name in db.keys():
498 return [given_name]
499 else:
500 return db.keys()
501
def _get(self, key):
    """Do the database retrieval of the sequence, returning a handle.

    key must unpack into exactly two items, (namespace, key).  The
    namespaces returned by self._get_check_names(namespace, self.db)
    are tried in order through self.db.lookup until one gives at least
    one hit.

    Returns a StringIO handle over the .text of the single hit.

    Raises ValueError when key is not a (namespace, key) pair,
    KeyError when no namespace yields a hit, and AssertionError when
    the matching namespace yields more than one hit.
    """
    # The StringIO *class* is needed; the old code imported the module
    # and then called the module itself, which raises TypeError.
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO

    try:
        namespace, key = key
    except (TypeError, ValueError):
        raise ValueError("Key should be tuple of (namespace, key)")

    location = []
    for check_name in self._get_check_names(namespace, self.db):
        location = self.db.lookup(**{check_name : key})
        if len(location) >= 1:
            break

    if len(location) < 1:
        # Report a miss as KeyError so that dictionary-style callers
        # can treat it as "not found" rather than as a broken index.
        raise KeyError("Could not find %s in the index" % (key,))
    if len(location) != 1:
        # Raised explicitly so the check survives running under -O.
        raise AssertionError("Got multiple hits: %s" % (location,))
    return StringIO(location[0].text)
519
521 from Bio import FormatIO
522 x = to_io.read(handle)
523 if isinstance(x, FormatIO.FormatIOIterator):
524 i = 0
525 for rec in x:
526 if i > 0:
527 raise AssertionError, "Multiple records returned"
528 i += 1
529 else:
530 rec = x
531 return rec
532
534
535
536
537
538
539
540
541 import operator
542 import urllib
543
544 if operator.isMappingType(params) and hasattr(params, "items"):
545 params = params.items()
546
547 paramlist = []
548 for key, value in params:
549 if key:
550 paramlist.append(urllib.urlencode([(key, value)]))
551 else:
552 paramlist.append(urllib.quote_plus(value))
553 return '&'.join(paramlist)
554