Package Bio :: Package Entrez
[hide private]
[frames] | no frames]

Source Code for Package Bio.Entrez

  1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Provides code to access NCBI over the WWW. 
  8   
  9  The main Entrez web page is available at: 
 10  http://www.ncbi.nlm.nih.gov/Entrez/ 
 11   
 12  A list of the Entrez utilities is available at: 
 13  http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html 
 14   
 15   
 16  Functions: 
 17  efetch       Retrieves records in the requested format from a list of one or 
 18               more primary IDs or from the user's environment 
 19  epost        Posts a file containing a list of primary IDs for future use in 
 20               the user's environment to use with subsequent search strategies 
 21  esearch      Searches and retrieves primary IDs (for use in EFetch, ELink, 
 22               and ESummary) and term translations and optionally retains 
 23               results for future use in the user's environment. 
 24  elink        Checks for the existence of an external or Related Articles link 
 25               from a list of one or more primary IDs.  Retrieves primary IDs 
 26               and relevancy scores for links to Entrez databases or Related 
 27               Articles;  creates a hyperlink to the primary LinkOut provider 
 28               for a specific ID and database, or lists LinkOut URLs 
 29               and Attributes for multiple IDs. 
 30  einfo        Provides field index term counts, last update, and available 
 31               links for each database. 
 32  esummary     Retrieves document summaries from a list of primary IDs or from 
 33               the user's environment. 
 34  egquery      Provides Entrez database counts in XML for a single search 
 35               using Global Query. 
 36  espell       Retrieves spelling suggestions. 
 37   
 38  read         Parses the XML results returned by any of the above functions. 
 39               Typical usage is: 
 40               >>> handle = Entrez.einfo() # or esearch, efetch, ... 
 41               >>> record = Entrez.read(handle) 
 42               where record is now a Python dictionary or list. 
 43   
 44  _open        Internally used function. 
 45   
 46  """ 
 47  import urllib, time, warnings 
 48  import os.path 
 49  from Bio import File 
 50   
 51   
 52  email = None 
 53   
 54   
 55  # XXX retmode? 
def epost(db, **keywds):
    """Post a file of identifiers for future use.

    EPost sends a list of UIs to Entrez so that they are kept in the
    user's environment and can be used by subsequent search strategies.

    The parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/epost_help.html

    db is the database name; any other keyword arguments are passed on
    to Entrez unchanged.

    Return a handle to the results.

    Raises an IOError exception if there's a network error.
    """
    # Database name first, followed by whatever extra options were given.
    request = dict({'db': db}, **keywds)
    # Unlike the other utilities here, EPost is sent as an HTTP POST.
    return _open('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi',
                 request, post=True)
73
def efetch(db, **keywds):
    """Fetches Entrez results which are returned as a handle.

    EFetch retrieves records in the requested format from a list of one or
    more UIs or from user's environment.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html

    db is the database name; any other keyword arguments are passed on
    to Entrez.  The unofficial return type "genbank" (matched case
    insensitively, as is the keyword name "rettype") is replaced by "gp"
    when db is "protein" and by "gb" otherwise, and a DeprecationWarning
    is issued.

    Return a handle to the results.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="gb")
    print handle.read()
    """
    # Iterate over a snapshot so that assigning into keywds is safe.
    for key, value in list(keywds.items()):
        if key.lower() != "rettype":
            continue
        # A non-string rettype (e.g. None, which _open drops later on)
        # cannot be the deprecated "genbank" value, so leave it alone
        # instead of failing on value.lower().
        if not hasattr(value, "lower"):
            continue
        if value.lower() != "genbank":
            continue
        warnings.warn('As of Easter 2009, Entrez EFetch no longer '
                      'supports the unofficial return type "genbank", '
                      'use "gb" or "gp" instead.', DeprecationWarning)
        if db.lower() == "protein":
            keywds[key] = "gp"  # GenPept
        else:
            keywds[key] = "gb"  # GenBank
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    variables = {'db': db}
    variables.update(keywds)
    return _open(cgi, variables)
107
def esearch(db, term, **keywds):
    """Run an Entrez search and return a handle to the results.

    ESearch searches and retrieves primary IDs (for use in EFetch, ELink
    and ESummary) and term translations, and optionally retains results
    for future use in the user's environment.

    The parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html

    db is the database name and term the search term; any other keyword
    arguments are passed on to Entrez unchanged.

    Return a handle to the results which are always in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    handle = Entrez.esearch(db="nucleotide", retmax=10, term="Opuntia")
    record = Entrez.read(handle)
    print record["Count"]
    print record["IdList"]
    """
    # Mandatory options first, then the caller's optional ones.
    request = dict({'db': db, 'term': term}, **keywds)
    return _open('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi',
                 request)
135 156
def einfo(**keywds):
    """Return a summary of the Entrez databases as a results handle.

    EInfo provides field names, index term counts, last update, and
    available links for each Entrez database.

    The parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html

    All keyword arguments are passed on to Entrez unchanged.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    record = Entrez.read(Entrez.einfo())
    print record['DbList']
    """
    # Hand _open its own copy of the options.
    return _open('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi',
                 dict(keywds))
180
def esummary(**keywds):
    """Retrieve document summaries as a results handle.

    ESummary retrieves document summaries from a list of primary IDs or
    from the user's environment.

    The parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html

    All keyword arguments are passed on to Entrez unchanged.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.
    """
    # Hand _open its own copy of the options.
    return _open('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi',
                 dict(keywds))
198
def egquery(**keywds):
    """Provide Entrez database counts for a global search.

    EGQuery provides Entrez database counts in XML for a single search
    using Global Query.

    The parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/egquery_help.html

    All keyword arguments are passed on to Entrez unchanged.

    Return a handle to the results in XML format.

    Raises an IOError exception if there's a network error.
    """
    # Hand _open its own copy of the options.
    return _open('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi',
                 dict(keywds))
216
def espell(**keywds):
    """Retrieve spelling suggestions, returned in a results handle.

    ESpell retrieves spelling suggestions, if available.

    The parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/espell_help.html

    All keyword arguments are passed on to Entrez unchanged.

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    record = Entrez.read(Entrez.espell(term="biopythooon"))
    print record["Query"]
    print record["CorrectedQuery"]
    """
    # Hand _open its own copy of the options.
    return _open('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi',
                 dict(keywds))
240
def read(handle):
    """Parse an XML file from the NCBI Entrez Utilities into Python objects.

    The XML read from handle is turned into a multilevel data structure
    of Python lists and dictionaries.  Most XML files returned by NCBI's
    Entrez Utilities can be parsed by this function, provided the DTD is
    available; Biopython includes the DTDs for most commonly used Entrez
    Utilities.

    Although the result looks like it is made of generic Python lists,
    dictionaries, strings, and so on, each of these is actually a class
    derived from the base type.  This makes it possible to store the
    attributes (if any) of each element in a dictionary
    my_element.attributes, and the tag name in my_element.tag.
    """
    # Imported here rather than at module level, as in the original code.
    from Parser import DataHandler
    # The DTDs live in the "DTDs" directory inside this package.
    dtd_directory = os.path.join(__path__[0], "DTDs")
    return DataHandler(dtd_directory).run(handle)
261
def parse(handle):
    """Parse an XML file from the NCBI Entrez Utilities via DataHandler.parse.

    A DataHandler is created with the "DTDs" directory of this package
    and its parse method is called on handle.  The result of that call
    is returned as is.

    NOTE(review): presumably this yields the records one at a time,
    in contrast to read() which returns a single record -- confirm
    against Parser.DataHandler.parse.
    """
    # Imported here rather than at module level, as in the original code.
    from Parser import DataHandler
    # The DTDs live in the "DTDs" directory inside this package.
    dtd_directory = os.path.join(__path__[0], "DTDs")
    return DataHandler(dtd_directory).parse(handle)
268
def _open(cgi, params=None, post=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it (entries whose
    value is None are left out); it is not modified.  If post is true the
    options are sent with an HTTP POST, otherwise they are appended to
    the URL for an HTTP GET.

    A "tool" option of "biopython" is added unless the caller supplied
    one, and the module level email (if set) is sent as the "email"
    option unless the caller supplied one.

    Does some simple error checking on the first seven lines of the
    reply, and will raise an IOError if it encounters a known error
    message.  Returns the reply wrapped in a File.UndoHandle, with those
    seven lines pushed back so the caller sees the complete reply.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    # Build a private copy of the parameters without the None values, so
    # that neither the caller's dictionary nor a shared default argument
    # is ever modified.
    query = {}
    if params:
        for key, value in params.items():
            if value is not None:
                query[key] = value
    # Tell Entrez that we are using Biopython
    if "tool" not in query:
        query["tool"] = "biopython"
    # Tell Entrez who we are
    if "email" not in query and email is not None:
        query["email"] = email

    # Open a handle to Entrez.
    options = urllib.urlencode(query, doseq=True)
    if post:
        # HTTP POST
        handle = urllib.urlopen(cgi, data=options)
    else:
        # HTTP GET
        handle = urllib.urlopen(cgi + "?" + options)

    # Wrap the handle inside an UndoHandle.
    uhandle = File.UndoHandle(handle)

    # Check for errors in the first 7 lines.  The lines are pushed back
    # in reverse order so that they are read again in the original order.
    lines = []
    for i in range(7):
        lines.append(uhandle.readline())
    for line in reversed(lines):
        uhandle.saveline(line)
    data = ''.join(lines)

    if "500 Proxy Error" in data:
        # Sometimes Entrez returns a Proxy Error instead of results
        raise IOError("500 Proxy Error (NCBI busy?)")
    elif "502 Proxy Error" in data:
        raise IOError("502 Proxy Error (NCBI busy?)")
    elif "WWW Error 500 Diagnostic" in data:
        raise IOError("WWW Error 500 Diagnostic (NCBI busy?)")
    elif "<title>Service unavailable!</title>" in data:
        # Probably later in the file it will say "Error 503"
        raise IOError("Service unavailable!")
    elif "<title>Bad Gateway!</title>" in data:
        # Probably later in the file it will say:
        #   "The proxy server received an invalid
        #    response from an upstream server."
        raise IOError("Bad Gateway!")
    elif "<title>414 Request-URI Too Large</title>" in data \
    or "<h1>Request-URI Too Large</h1>" in data:
        raise IOError("Requested URL too long (try using EPost?)")
    elif data.startswith("Error:"):
        # e.g. 'Error: Your session has expired. Please repeat your search.\n'
        raise IOError(data.strip())
    elif data.startswith("The resource is temporarily unavailable"):
        # This can occur with an invalid query_key
        # Perhaps this should be a ValueError?
        raise IOError("The resource is temporarily unavailable")
    elif data.startswith("download dataset is empty"):
        # This can occur when omit the identifier, or the WebEnv and query_key
        # Perhaps this should be a ValueError?
        raise IOError("download dataset is empty")
    elif data[:5] == "ERROR":
        # XXX Possible bug here, because I don't know whether this really
        # occurs on the first line.  I need to check this!
        raise IOError("ERROR, possibly because id not available?")
    # Should I check for 404?  timeout?  etc?
    return uhandle

# Time of the previous query, used by _open to enforce the rate limit.
_open.previous = 0