Package Bio :: Package Entrez
[hide private]
[frames] | [no frames]

Source Code for Package Bio.Entrez

  1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Provides code to access NCBI over the WWW. 
  8   
  9  The main Entrez web page is available at: 
 10  http://www.ncbi.nlm.nih.gov/Entrez/ 
 11   
 12  A list of the Entrez utilities is available at: 
 13  http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html 
 14   
 15   
 16  Functions: 
 17  efetch       Retrieves records in the requested format from a list of one or 
 18               more primary IDs or from the user's environment 
 19  epost        Posts a file containing a list of primary IDs for future use in 
 20               the user's environment to use with subsequent search strategies 
 21  esearch      Searches and retrieves primary IDs (for use in EFetch, ELink, 
 22               and ESummary) and term translations and optionally retains 
 23               results for future use in the user's environment. 
 24  elink        Checks for the existence of an external or Related Articles link 
 25               from a list of one or more primary IDs.  Retrieves primary IDs 
 26               and relevancy scores for links to Entrez databases or Related 
 27               Articles;  creates a hyperlink to the primary LinkOut provider 
 28               for a specific ID and database, or lists LinkOut URLs 
 29               and Attributes for multiple IDs. 
 30  einfo        Provides field index term counts, last update, and available 
 31               links for each database. 
 32  esummary     Retrieves document summaries from a list of primary IDs or from 
 33               the user's environment. 
 34  egquery      Provides Entrez database counts in XML for a single search 
 35               using Global Query. 
 36  espell       Retrieves spelling suggestions. 
 37   
 38  read         Parses the XML results returned by any of the above functions. 
 39               Typical usage is: 
 40               >>> handle = Entrez.einfo() # or esearch, efetch, ... 
 41               >>> record = Entrez.read(handle) 
 42               where record is now a Python dictionary or list. 
 43   
 44  _open        Internally used function. 
 45   
 46  """ 
 47  import urllib, time, warnings 
 48  import os.path 
 49  from Bio import File 
 50   
 51   
 52  email = None  # Optional contact address; when not None, _open sends it to Entrez as the "email" parameter unless the caller already supplied one.
 53   
def query(cmd, db, cgi='http://www.ncbi.nlm.nih.gov/sites/entrez',
          **keywds):
    """Formerly queried the Entrez web site for HTML results (DEPRECATED).

    This function no longer contacts NCBI.  It only issues a
    DeprecationWarning, because querying the Entrez web site directly
    breaks NCBI's rule to only use the E-Utilities URL, and then returns
    None.

    The arguments cmd, db, cgi and any extra keyword arguments are accepted
    so that existing calls keep working, but they are ignored.  They were
    documented at:
    http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helplinks.chapter.linkshelp
    """
    import warnings
    # stacklevel=2 attributes the warning to the code calling query(),
    # which is where the deprecated usage has to be fixed.
    warnings.warn("Bio.Entrez.query is deprecated, since it breaks NCBI's rule to only use the E-Utilities URL.", DeprecationWarning, stacklevel=2)
67 68 # XXX retmode?
def epost(db, cgi=None, **keywds):
    """Post a file of identifiers for future use.

    EPost posts a file containing a list of UIs for future use in the
    user's environment, to use with subsequent search strategies.

    The db argument and all extra keyword arguments are sent to NCBI as
    request parameters.  See the online documentation for an explanation
    of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/epost_help.html

    The cgi argument is deprecated: passing a value only triggers a
    DeprecationWarning, and the official EPost URL is used regardless.

    Return a handle to the results.

    Raises an IOError exception if there's a network error.
    """
    if cgi:
        import warnings
        warnings.warn("Using a URL other than NCBI's main url for the E-Utilities is deprecated.", DeprecationWarning)
    # Any caller-supplied URL is discarded; 'db' goes first, then the extras.
    request_args = dict(db=db, **keywds)
    return _open('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi',
                 request_args)
89
def efetch(db, cgi=None, **keywds):
    """Fetches Entrez results which are returned as a handle.

    EFetch retrieves records in the requested format from a list of one or
    more UIs or from user's environment.

    The db argument and all extra keyword arguments are sent to NCBI as
    request parameters.  See the online documentation for an explanation
    of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html

    The unofficial return type "genbank" (matched case-insensitively, under
    any capitalisation of the "rettype" keyword) is replaced by "gp" when db
    is "protein" and by "gb" otherwise, with a DeprecationWarning.  A rettype
    value that is not a string (for example None) is passed through
    untouched.

    The cgi argument is deprecated: passing a value only triggers a
    DeprecationWarning, and the official EFetch URL is used regardless.

    Return a handle to the results.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="gb")
    print handle.read()
    """
    for key in keywds:
        if key.lower() != "rettype":
            continue
        rettype = keywds[key]
        # Only string-like values can name a return type.  Skipping the
        # others (e.g. None, which _open drops from the request) avoids
        # an AttributeError on .lower().
        if not hasattr(rettype, "lower") or rettype.lower() != "genbank":
            continue
        import warnings
        warnings.warn('As of Easter 2009, Entrez EFetch no longer '
                      'supports the unofficial return type "genbank", '
                      'use "gb" or "gp" instead.', DeprecationWarning)
        # Replacing the value of an existing key does not resize the
        # dictionary, so it is safe while iterating over it.
        if db.lower() == "protein":
            keywds[key] = "gp"  # GenPept
        else:
            keywds[key] = "gb"  # GenBank
    if cgi:
        import warnings
        warnings.warn("Using a URL other than NCBI's main url for the "
                      "E-Utilities is deprecated.", DeprecationWarning)
    # Any caller-supplied URL is discarded in favour of the official one.
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    variables = {'db': db}
    variables.update(keywds)
    return _open(cgi, variables)
127
def esearch(db, term, cgi=None, **keywds):
    """ESearch runs an Entrez search and returns a handle to the results.

    ESearch searches and retrieves primary IDs (for use in EFetch, ELink
    and ESummary) and term translations, and optionally retains results
    for future use in the user's environment.

    The db and term arguments and all extra keyword arguments are sent to
    NCBI as request parameters.  See the online documentation for an
    explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html

    The cgi argument is deprecated: passing a value only triggers a
    DeprecationWarning, and the official ESearch URL is used regardless.

    Return a handle to the results which are always in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    handle = Entrez.esearch(db="nucleotide", retmax=10, term="Opuntia")
    record = Entrez.read(handle)
    print record["Count"]
    print record["IdList"]
    """
    if cgi:
        import warnings
        warnings.warn("Using a URL other than NCBI's main url for the E-Utilities is deprecated.", DeprecationWarning)
    # Any caller-supplied URL is discarded; 'db' and 'term' go first.
    request_args = dict(db=db, term=term, **keywds)
    return _open('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi',
                 request_args)
158 182
def einfo(cgi=None, **keywds):
    """EInfo returns a summary of the Entrez databases as a results handle.

    EInfo provides field names, index term counts, last update, and
    available links for each Entrez database.

    All extra keyword arguments are sent to NCBI as request parameters.
    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html

    The cgi argument is deprecated: passing a value only triggers a
    DeprecationWarning, and the official EInfo URL is used regardless.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    record = Entrez.read(Entrez.einfo())
    print record['DbList']
    """
    if cgi:
        import warnings
        warnings.warn("Using a URL other than NCBI's main url for the E-Utilities is deprecated.", DeprecationWarning)
    # Hand _open its own copy of the keyword arguments.
    request_args = dict(keywds)
    return _open('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi',
                 request_args)
209
def esummary(cgi=None, **keywds):
    """ESummary retrieves document summaries as a results handle.

    ESummary retrieves document summaries from a list of primary IDs or
    from the user's environment.

    All extra keyword arguments are sent to NCBI as request parameters.
    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html

    The cgi argument is deprecated: passing a value only triggers a
    DeprecationWarning, and the official ESummary URL is used regardless.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.
    """
    if cgi:
        import warnings
        warnings.warn("Using a URL other than NCBI's main url for the E-Utilities is deprecated.", DeprecationWarning)
    # Hand _open its own copy of the keyword arguments.
    request_args = dict(keywds)
    return _open('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi',
                 request_args)
230
def egquery(cgi=None, **keywds):
    """EGQuery provides Entrez database counts for a global search.

    EGQuery provides Entrez database counts in XML for a single search
    using Global Query.

    All extra keyword arguments are sent to NCBI as request parameters.
    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/egquery_help.html

    The cgi argument is deprecated: passing a value only triggers a
    DeprecationWarning, and the official EGQuery URL is used regardless.

    Return a handle to the results in XML format.

    Raises an IOError exception if there's a network error.
    """
    if cgi:
        import warnings
        warnings.warn("Using a URL other than NCBI's main url for the E-Utilities is deprecated.", DeprecationWarning)
    # Hand _open its own copy of the keyword arguments.
    request_args = dict(keywds)
    return _open('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi',
                 request_args)
251
def espell(cgi=None, **keywds):
    """ESpell retrieves spelling suggestions, returned in a results handle.

    ESpell retrieves spelling suggestions, if available.

    All extra keyword arguments are sent to NCBI as request parameters.
    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/espell_help.html

    The cgi argument is deprecated: passing a value only triggers a
    DeprecationWarning, and the official ESpell URL is used regardless.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    record = Entrez.read(Entrez.espell(term="biopythooon"))
    print record["Query"]
    print record["CorrectedQuery"]
    """
    if cgi:
        import warnings
        warnings.warn("Using a URL other than NCBI's main url for the E-Utilities is deprecated.", DeprecationWarning)
    # Hand _open its own copy of the keyword arguments.
    request_args = dict(keywds)
    return _open('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi',
                 request_args)
278
def read(handle):
    """Parses an XML file from the NCBI Entrez Utilities into python objects.

    The handle is passed to a Parser.DataHandler that is given the "DTDs"
    directory inside this package, and the record it produces is returned.

    The result is a multilevel data structure of Python lists and
    dictionaries.  Most XML files returned by NCBI's Entrez Utilities can
    be parsed by this function, provided the DTD is available; Biopython
    includes the DTDs for the most commonly used Entrez Utilities.

    Although the data structure seems to consist of generic Python lists,
    dictionaries, strings, and so on, each of these is actually a class
    derived from the base type.  This allows the attributes (if any) of
    each element to be stored in a dictionary my_element.attributes, and
    the tag name in my_element.tag.
    """
    from Parser import DataHandler
    # The DTD files live in the "DTDs" directory of this package.
    dtd_directory = os.path.join(__path__[0], "DTDs")
    return DataHandler(dtd_directory).run(handle)
299
def _open(cgi, params=None):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it (default: no
    options).  The dictionary is copied, so the caller's dictionary is
    never modified.  Entries whose value is None are left out of the
    request, "tool" defaults to "biopython", and "email" defaults to the
    module-level email setting when that is not None.

    Returns a File.UndoHandle wrapping the response.  The first seven
    lines of the response are inspected for known error messages and then
    pushed back, so the returned handle still yields the full response.

    Raises an IOError if one of those error messages is found.

    This function also enforces NCBI's limit of at most three queries per
    second, sleeping if the previous call was less than a third of a
    second ago.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # Build a private copy without the None values.  Copying keeps the
    # caller's dictionary intact and avoids deleting keys from a
    # dictionary while iterating over it.
    if params is None:
        params = {}
    params = dict((key, value) for key, value in params.items()
                  if value is not None)
    # Tell Entrez that we are using Biopython
    if "tool" not in params:
        params["tool"] = "biopython"
    # Tell Entrez who we are
    if "email" not in params and email is not None:
        params["email"] = email
    # Open a handle to Entrez.
    options = urllib.urlencode(params, doseq=True)
    handle = urllib.urlopen(cgi + "?" + options)

    # Wrap the handle inside an UndoHandle.
    uhandle = File.UndoHandle(handle)

    # Check for errors in the first 7 lines.
    # This is kind of ugly.
    lines = [uhandle.readline() for _ in range(7)]
    # Push the lines back in reverse order so they are read again in
    # their original order.
    for line in reversed(lines):
        uhandle.saveline(line)
    data = ''.join(lines)

    if "500 Proxy Error" in data:
        # Sometimes Entrez returns a Proxy Error instead of results
        raise IOError("500 Proxy Error (NCBI busy?)")
    elif "502 Proxy Error" in data:
        raise IOError("502 Proxy Error (NCBI busy?)")
    elif "WWW Error 500 Diagnostic" in data:
        raise IOError("WWW Error 500 Diagnostic (NCBI busy?)")
    elif "<title>Service unavailable!</title>" in data:
        # Probably later in the file it will say "Error 503"
        raise IOError("Service unavailable!")
    elif "<title>Bad Gateway!</title>" in data:
        # Probably later in the file it will say:
        #   "The proxy server received an invalid
        #    response from an upstream server."
        raise IOError("Bad Gateway!")
    elif data.startswith("Error:"):
        # e.g. 'Error: Your session has expired. Please repeat your search.\n'
        raise IOError(data.strip())
    elif data.startswith("The resource is temporarily unavailable"):
        # This can occur with an invalid query_key
        # Perhaps this should be a ValueError?
        raise IOError("The resource is temporarily unavailable")
    elif data.startswith("download dataset is empty"):
        # This can occur when omit the identifier, or the WebEnv and query_key
        # Perhaps this should be a ValueError?
        raise IOError("download dataset is empty")
    elif data[:5] == "ERROR":
        # XXX Possible bug here, because I don't know whether this really
        # occurs on the first line.  I need to check this!
        raise IOError("ERROR, possibly because id not available?")
    # Should I check for 404?  timeout?  etc?
    return uhandle


# Time of the most recent query; 0 means no query has been made yet.
_open.previous = 0