Package Bio :: Package SCOP
[hide private]
[frames] | no frames]

Source Code for Package Bio.SCOP

  1  # Copyright 2001 by Gavin E. Crooks.  All rights reserved. 
  2  # Modifications Copyright 2004/2005 James Casbon. All rights Reserved. 
  3  # 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  # 
  8  # Changes made by James Casbon: 
  9  # - New Astral class 
 10  # - SQL functionality for both Scop and Astral classes 
 11  # - All sunids are int not strings 
 12  # 
 13  # Code written by Jeffrey Chang to access SCOP over the internet, which 
 14  # was previously in Bio.WWW.SCOP, has now been merged into this module. 
 15   
 16  """ SCOP: Structural Classification of Proteins. 
 17   
 18  The SCOP database aims to provide a manually constructed classification of 
 19  all know protein structures into a hierarchy, the main levels of which 
 20  are family, superfamily and fold. 
 21   
 22  * "SCOP":http://scop.mrc-lmb.cam.ac.uk/scop/ 
 23   
 24  * "Introduction":http://scop.mrc-lmb.cam.ac.uk/scop/intro.html 
 25   
 26  * "SCOP parsable files":http://scop.mrc-lmb.cam.ac.uk/scop/parse/ 
 27   
 28  The Scop object in this module represents the entire SCOP classification. It 
 29  can be built from the three SCOP parsable files, modified is so desired, and 
 30  converted back to the same file formats. A single SCOP domain (represented 
 31  by the Domain class) can be obtained from Scop using the domain's SCOP 
 32  identifier (sid). 
 33   
 34   
 35  nodeCodeDict  -- A mapping between known 2 letter node codes and a longer 
 36                    description. The known node types are 'cl' (class), 'cf' 
 37                    (fold), 'sf' (superfamily), 'fa' (family), 'dm' (domain),  
 38                    'sp' (species), 'px' (domain). Additional node types may 
 39                    be added in the future. 
 40   
 41  This module also provides code to access SCOP over the WWW. 
 42   
 43  Functions: 
 44  search        -- Access the main CGI script. 
 45  _open         -- Internally used function. 
 46   
 47  """ 
 48   
 49  from types import * 
 50  import os 
 51   
 52  import Des 
 53  import Cla 
 54  import Hie 
 55  from Residues import * 
 56  from Bio import SeqIO 
 57  from Bio.Seq import Seq 
 58   
 59   
 60  nodeCodeDict = { 'cl':'class', 'cf':'fold', 'sf':'superfamily', 
 61                   'fa':'family', 'dm':'protein', 'sp':'species', 'px':'domain'} 
 62   
 63   
 64  _nodetype_to_code= { 'class': 'cl', 'fold': 'cf', 'superfamily': 'sf', 
 65                       'family': 'fa', 'protein': 'dm', 'species': 'sp', 'domain': 'px'} 
 66   
 67  nodeCodeOrder = [ 'ro', 'cl', 'cf', 'sf', 'fa', 'dm', 'sp', 'px' ]  
 68   
 69  astralBibIds = [10,20,25,30,35,40,50,70,90,95,100] 
 70   
 71  astralEvs = [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 1e-4, 1e-5, 1e-10, 1e-15, 
 72               1e-20, 1e-25, 1e-50] 
 73   
 74  astralEv_to_file = { 10: 'e+1', 5: 'e+0,7', 1: 'e+0', 0.5: 'e-0,3', 0.1: 'e-1', 
 75                       0.05: 'e-1,3', 0.01: 'e-2', 0.005: 'e-2,3', 0.001: 'e-3', 
 76                       1e-4: 'e-4',  1e-5: 'e-5', 1e-10: 'e-10', 1e-15: 'e-15', 
 77                       1e-20: 'e-20', 1e-25: 'e-25', 1e-50: 'e-50' } 
 78   
 79  astralEv_to_sql = { 10: 'e1', 5: 'e0_7', 1: 'e0', 0.5: 'e_0_3', 0.1: 'e_1', 
 80                       0.05: 'e_1_3', 0.01: 'e_2', 0.005: 'e_2_3', 0.001: 'e_3', 
 81                       1e-4: 'e_4',  1e-5: 'e_5', 1e-10: 'e_10', 1e-15: 'e_15', 
 82                       1e-20: 'e_20', 1e-25: 'e_25', 1e-50: 'e_50' } 
 83   
 84   
85 -def cmp_sccs(sccs1, sccs2) :
86 """Order SCOP concise classification strings (sccs). 87 88 a.4.5.1 < a.4.5.11 < b.1.1.1 89 90 A sccs (e.g. a.4.5.11) compactly represents a domain's classification. 91 The letter represents the class, and the numbers are the fold, 92 superfamily, and family, respectively. 93 94 """ 95 96 s1 = sccs1.split(".") 97 s2 = sccs2.split(".") 98 99 if s1[0] != s2[0]: return cmp(s1[0], s2[0]) 100 101 s1 = map(int, s1[1:]) 102 s2 = map(int, s2[1:]) 103 104 return cmp(s1,s2)
105 106 107 _domain_re = re.compile(r">?([\w_\.]*)\s+([\w\.]*)\s+\(([^)]*)\) (.*)") 108
109 -def parse_domain(str) :
110 """Convert an ASTRAL header string into a Scop domain. 111 112 An ASTRAL (http://astral.stanford.edu/) header contains a concise 113 description of a SCOP domain. A very similar format is used when a 114 Domain object is converted into a string. The Domain returned by this 115 method contains most of the SCOP information, but it will not be located 116 within the SCOP hierarchy (i.e. The parent node will be None). The 117 description is composed of the SCOP protein and species descriptions. 118 119 A typical ASTRAL header looks like -- 120 >d1tpt_1 a.46.2.1 (1-70) Thymidine phosphorylase {Escherichia coli} 121 """ 122 123 m = _domain_re.match(str) 124 if (not m) : raise ValueError("Domain: "+ str) 125 126 dom = Domain() 127 dom.sid = m.group(1) 128 dom.sccs = m.group(2) 129 dom.residues = Residues(m.group(3)) 130 if not dom.residues.pdbid : 131 dom.residues.pdbid= dom.sid[1:5] 132 dom.description = m.group(4).strip() 133 134 return dom
135
136 -def _open_scop_file(scop_dir_path, version, filetype) :
137 filename = "dir.%s.scop.txt_%s" % (filetype,version) 138 handle = open(os.path.join( scop_dir_path, filename)) 139 return handle
140 141
142 -class Scop:
143 """The entire SCOP hierarchy. 144 145 root -- The root node of the hierarchy 146 """
147 - def __init__(self, cla_handle=None, des_handle=None, hie_handle=None, 148 dir_path=None, db_handle=None, version=None):
149 """Build the SCOP hierarchy from the SCOP parsable files, or a sql backend. 150 151 If no file handles are given, then a Scop object with a single 152 empty root node is returned. 153 154 If a directory and version are given (with dir_path=.., version=...) or 155 file handles for each file, the whole scop tree will be built in memory. 156 157 If a MySQLdb database handle is given, the tree will be built as needed, 158 minimising construction times. To build the SQL database to the methods 159 write_xxx_sql to create the tables. 160 161 """ 162 self._sidDict = {} 163 self._sunidDict = {} 164 165 if cla_handle==des_handle==hie_handle==dir_path==db_handle==None: return 166 167 if dir_path is None and db_handle is None: 168 if cla_handle == None or des_handle==None or hie_handle==None: 169 raise RuntimeError("Need CLA, DES and HIE files to build SCOP") 170 171 sunidDict = {} 172 173 self.db_handle = db_handle 174 try: 175 176 if db_handle: 177 # do nothing if we have a db handle, we'll do it all on the fly 178 pass 179 180 else: 181 # open SCOP parseable files 182 if dir_path: 183 if not version : 184 raise RuntimeError("Need SCOP version to find parsable files in directory") 185 if cla_handle or des_handle or hie_handle: 186 raise RuntimeError("Cannot specify SCOP directory and specific files") 187 188 cla_handle = _open_scop_file( dir_path, version, 'cla') 189 des_handle = _open_scop_file( dir_path, version, 'des') 190 hie_handle = _open_scop_file( dir_path, version, 'hie') 191 192 root = Node() 193 domains = [] 194 root.sunid=0 195 root.type='ro' 196 sunidDict[root.sunid] = root 197 self.root = root 198 root.description = 'SCOP Root' 199 200 # Build the rest of the nodes using the DES file 201 records = Des.parse(des_handle) 202 for record in records: 203 if record.nodetype =='px' : 204 n = Domain() 205 n.sid = record.name 206 domains.append(n) 207 else : 208 n = Node() 209 n.sunid = record.sunid 210 n.type = record.nodetype 211 n.sccs = record.sccs 212 n.description = record.description 213 214 sunidDict[n.sunid] = n 215 216 # Glue all of the Nodes together using the HIE file 217 records = Hie.parse(hie_handle) 218 for record in records: 219 if record.sunid not in sunidDict : 220 print record.sunid 221 222 n = sunidDict[record.sunid] 223 224 if record.parent != '' : # Not root node 225 226 if record.parent not in sunidDict: 227 raise ValueError("Incomplete data?") 228 229 n.parent = sunidDict[record.parent] 230 231 for c in record.children: 232 if c not in sunidDict : 233 raise ValueError("Incomplete data?") 234 n.children.append(sunidDict[c]) 235 236 237 # Fill in the gaps with information from the CLA file 238 sidDict = {} 239 records = Cla.parse(cla_handle) 240 for record in records: 241 n = sunidDict[record.sunid] 242 assert n.sccs == record.sccs 243 assert n.sid == record.sid 244 n.residues = record.residues 245 sidDict[n.sid] = n 246 247 # Clean up 248 self._sunidDict = sunidDict 249 self._sidDict = sidDict 250 self._domains = tuple(domains) 251 252 finally: 253 if dir_path : 254 # If we opened the files, we close the files 255 if cla_handle : cla_handle.close() 256 if des_handle : des_handle.close() 257 if hie_handle : hie_handle.close()
258 259
260 - def getRoot(self):
261 return self.getNodeBySunid(0)
262 263
264 - def getDomainBySid(self, sid) :
265 """Return a domain from its sid""" 266 if sid in self._sidDict: 267 return self._sidDict[sid] 268 if self.db_handle: 269 self.getDomainFromSQL(sid=sid) 270 if sid in self._sidDict: 271 return self._sidDict[sid] 272 else: 273 return None
274 275
276 - def getNodeBySunid(self, sunid) :
277 """Return a node from its sunid""" 278 if sunid in self._sunidDict: 279 return self._sunidDict[sunid] 280 if self.db_handle: 281 self.getDomainFromSQL(sunid=sunid) 282 if sunid in self._sunidDict: 283 return self._sunidDict[sunid] 284 else: 285 return None
286 287
288 - def getDomains(self) :
289 """Returns an ordered tuple of all SCOP Domains""" 290 if self.db_handle: 291 return self.getRoot().getDescendents('px') 292 else: 293 return self._domains
294 295 296
297 - def write_hie(self, handle) :
298 """Build an HIE SCOP parsable file from this object""" 299 nodes = self._sunidDict.values() 300 # We order nodes to ease comparison with original file 301 nodes.sort(lambda n1,n2: cmp(n1.sunid, n2.sunid)) 302 303 for n in nodes : 304 handle.write(str(n.toHieRecord()))
305 306
307 - def write_des(self, handle) :
308 """Build a DES SCOP parsable file from this object""" 309 nodes = self._sunidDict.values() 310 # Origional SCOP file is not ordered? 311 nodes.sort(lambda n1,n2: cmp(n1.sunid, n2.sunid)) 312 313 for n in nodes : 314 if n != self.root : 315 handle.write(str(n.toDesRecord()))
316 317
318 - def write_cla(self, handle) :
319 """Build a CLA SCOP parsable file from this object""" 320 nodes = self._sidDict.values() 321 # We order nodes to ease comparison with original file 322 nodes.sort(lambda n1,n2: cmp(n1.sunid, n2.sunid)) 323 324 for n in nodes : 325 handle.write(str(n.toClaRecord()))
326 327
328 - def getDomainFromSQL(self, sunid=None, sid=None):
329 """Load a node from the SQL backend using sunid or sid""" 330 if sunid==sid==None: return None 331 332 cur = self.db_handle.cursor() 333 334 if sid: 335 cur.execute("SELECT sunid FROM cla WHERE sid=%s", sid) 336 res = cur.fetchone() 337 if res is None: 338 return None 339 sunid = res[0] 340 341 cur.execute("SELECT * FROM des WHERE sunid=%s", sunid) 342 data = cur.fetchone() 343 344 if data is not None: 345 n = None 346 347 #determine if Node or Domain 348 if data[1] != "px": 349 n = Node(scop=self) 350 351 cur.execute("SELECT child FROM hie WHERE parent=%s", sunid) 352 children = [] 353 for c in cur.fetchall(): 354 children.append(c[0]) 355 n.children = children 356 357 358 else: 359 n = Domain(scop=self) 360 cur.execute("select sid, residues, pdbid from cla where sunid=%s", 361 sunid) 362 363 [n.sid,n.residues,pdbid] = cur.fetchone() 364 n.residues = Residues(n.residues) 365 n.residues.pdbid=pdbid 366 self._sidDict[n.sid] = n 367 368 [n.sunid,n.type,n.sccs,n.description] = data 369 370 if data[1] != 'ro': 371 cur.execute("SELECT parent FROM hie WHERE child=%s", sunid) 372 n.parent = cur.fetchone()[0] 373 374 n.sunid = int(n.sunid) 375 376 self._sunidDict[n.sunid] = n
377 378
379 - def getAscendentFromSQL(self, node, type):
380 """Get ascendents using SQL backend""" 381 if nodeCodeOrder.index(type) >= nodeCodeOrder.index(node.type): return None 382 383 cur = self.db_handle.cursor() 384 cur.execute("SELECT "+type+" from cla WHERE "+node.type+"=%s", (node.sunid)) 385 result = cur.fetchone() 386 if result is not None: 387 return self.getNodeBySunid(result[0]) 388 else: 389 return None
390 391
392 - def getDescendentsFromSQL(self, node, type):
393 """Get descendents of a node using the database backend. This avoids 394 repeated iteration of SQL calls and is therefore much quicker than 395 repeatedly calling node.getChildren(). 396 """ 397 if nodeCodeOrder.index(type) <= nodeCodeOrder.index(node.type): return [] 398 399 des_list = [] 400 401 402 # SQL cla table knows nothing about 'ro' 403 if node.type == 'ro': 404 for c in node.getChildren(): 405 for d in self.getDescendentsFromSQL(c,type): 406 des_list.append(d) 407 return des_list 408 409 cur = self.db_handle.cursor() 410 411 412 if type != 'px': 413 cur.execute("SELECT DISTINCT des.sunid,des.type,des.sccs,description FROM \ 414 cla,des WHERE cla."+node.type+"=%s AND cla."+type+"=des.sunid", (node.sunid)) 415 data = cur.fetchall() 416 for d in data: 417 if int(d[0]) not in self._sunidDict: 418 n = Node(scop=self) 419 [n.sunid,n.type,n.sccs,n.description] = d 420 n.sunid=int(n.sunid) 421 self._sunidDict[n.sunid] = n 422 423 cur.execute("SELECT parent FROM hie WHERE child=%s", n.sunid) 424 n.parent = cur.fetchone()[0] 425 426 cur.execute("SELECT child FROM hie WHERE parent=%s", n.sunid) 427 children = [] 428 for c in cur.fetchall(): 429 children.append(c[0]) 430 n.children = children 431 432 433 des_list.append( self._sunidDict[int(d[0])] ) 434 435 else: 436 cur.execute("SELECT cla.sunid,sid,pdbid,residues,cla.sccs,type,description,sp\ 437 FROM cla,des where cla.sunid=des.sunid and cla."+node.type+"=%s", 438 node.sunid) 439 440 data = cur.fetchall() 441 for d in data: 442 if int(d[0]) not in self._sunidDict: 443 n = Domain(scop=self) 444 #[n.sunid, n.sid, n.pdbid, n.residues, n.sccs, n.type, 445 #n.description,n.parent] = data 446 [n.sunid,n.sid, pdbid,n.residues,n.sccs,n.type,n.description, 447 n.parent] = d[0:8] 448 n.residues = Residues(n.residues) 449 n.residues.pdbid = pdbid 450 n.sunid = int(n.sunid) 451 self._sunidDict[n.sunid] = n 452 self._sidDict[n.sid] = n 453 454 455 des_list.append( self._sunidDict[int(d[0])] ) 456 457 return des_list
458 459 460 461
462 - def write_hie_sql(self, handle):
463 """Write HIE data to SQL database""" 464 cur = handle.cursor() 465 466 cur.execute("DROP TABLE IF EXISTS hie") 467 cur.execute("CREATE TABLE hie (parent INT, child INT, PRIMARY KEY (child),\ 468 INDEX (parent) )") 469 470 for p in self._sunidDict.values(): 471 for c in p.children: 472 cur.execute("INSERT INTO hie VALUES (%s,%s)" % (p.sunid, c.sunid))
473 474
475 - def write_cla_sql(self, handle):
476 """Write CLA data to SQL database""" 477 cur = handle.cursor() 478 479 cur.execute("DROP TABLE IF EXISTS cla") 480 cur.execute("CREATE TABLE cla (sunid INT, sid CHAR(8), pdbid CHAR(4),\ 481 residues VARCHAR(50), sccs CHAR(10), cl INT, cf INT, sf INT, fa INT,\ 482 dm INT, sp INT, px INT, PRIMARY KEY (sunid), INDEX (SID) )") 483 484 for n in self._sidDict.values(): 485 c = n.toClaRecord() 486 cur.execute( "INSERT INTO cla VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", 487 (n.sunid, n.sid, c.residues.pdbid, c.residues, n.sccs, 488 n.getAscendent('cl').sunid, n.getAscendent('cf').sunid, 489 n.getAscendent('sf').sunid, n.getAscendent('fa').sunid, 490 n.getAscendent('dm').sunid, n.getAscendent('sp').sunid, 491 n.sunid ))
492 493
494 - def write_des_sql(self, handle):
495 """Write DES data to SQL database""" 496 cur = handle.cursor() 497 498 cur.execute("DROP TABLE IF EXISTS des") 499 cur.execute("CREATE TABLE des (sunid INT, type CHAR(2), sccs CHAR(10),\ 500 description VARCHAR(255),\ 501 PRIMARY KEY (sunid) )") 502 503 for n in self._sunidDict.values(): 504 cur.execute( "INSERT INTO des VALUES (%s,%s,%s,%s)", 505 ( n.sunid, n.type, n.sccs, n.description ) )
506 507 508
509 -class Node :
510 """ A node in the Scop hierarchy 511 512 sunid -- SCOP unique identifiers. e.g. '14986' 513 514 parent -- The parent node 515 516 children -- A list of child nodes 517 518 sccs -- SCOP concise classification string. e.g. 'a.1.1.2' 519 520 type -- A 2 letter node type code. e.g. 'px' for domains 521 522 description -- 523 524 """
525 - def __init__(self, scop=None) :
526 """Create a Node in the scop hierarchy. If a Scop instance is provided to the 527 constructor, this will be used to lookup related references using the SQL 528 methods. If no instance is provided, it is assumed the whole tree exists 529 and is connected.""" 530 self.sunid='' 531 self.parent = None 532 self.children=[] 533 self.sccs = '' 534 self.type ='' 535 self.description ='' 536 self.scop=scop
537
538 - def __str__(self) :
539 s = [] 540 s.append(str(self.sunid)) 541 s.append(self.sccs) 542 s.append(self.type) 543 s.append(self.description) 544 545 return " ".join(s)
546
547 - def toHieRecord(self):
548 """Return an Hie.Record""" 549 rec = Hie.Record() 550 rec.sunid = str(self.sunid) 551 if self.getParent() : #Not root node 552 rec.parent = str(self.getParent().sunid) 553 else: 554 rec.parent = '-' 555 for c in self.getChildren() : 556 rec.children.append(str(c.sunid)) 557 return rec
558
559 - def toDesRecord(self):
560 """Return a Des.Record""" 561 rec = Des.Record() 562 rec.sunid = str(self.sunid) 563 rec.nodetype = self.type 564 rec.sccs = self.sccs 565 rec.description = self.description 566 return rec
567
568 - def getChildren(self):
569 """Return a list of children of this Node""" 570 if self.scop is None: 571 return self.children 572 else: 573 return map ( self.scop.getNodeBySunid, self.children )
574
575 - def getParent(self):
576 """Return the parent of this Node""" 577 if self.scop is None: 578 return self.parent 579 else: 580 return self.scop.getNodeBySunid( self.parent )
581
582 - def getDescendents( self, node_type) :
583 """ Return a list of all decendent nodes of the given type. Node type can a 584 two letter code or longer description. e.g. 'fa' or 'family' 585 """ 586 if node_type in _nodetype_to_code: 587 node_type = _nodetype_to_code[node_type] 588 589 nodes = [self] 590 if self.scop: 591 return self.scop.getDescendentsFromSQL(self,node_type) 592 while nodes[0].type != node_type: 593 if nodes[0].type == 'px' : return [] # Fell of the bottom of the hierarchy 594 child_list = [] 595 for n in nodes: 596 for child in n.getChildren(): 597 child_list.append( child ) 598 nodes = child_list 599 600 return nodes
601 602
603 - def getAscendent( self, node_type) :
604 """ Return the ancenstor node of the given type, or None.Node type can a 605 two letter code or longer description. e.g. 'fa' or 'family'""" 606 if node_type in _nodetype_to_code: 607 node_type = _nodetype_to_code[node_type] 608 609 if self.scop: 610 return self.scop.getAscendentFromSQL(self,node_type) 611 else: 612 n = self 613 if n.type == node_type: return None 614 while n.type != node_type: 615 if n.type == 'ro': return None # Fell of the top of the hierarchy 616 n = n.getParent() 617 618 return n
619 620 621
622 -class Domain(Node) :
623 """ A SCOP domain. A leaf node in the Scop hierarchy. 624 625 sid -- The SCOP domain identifier. e.g. 'd5hbib_' 626 627 residues -- A Residue object. It defines the collection 628 of PDB atoms that make up this domain. 629 """
630 - def __init__(self,scop=None) :
631 Node.__init__(self,scop=scop) 632 self.sid = '' 633 self.residues = None
634
635 - def __str__(self) :
636 s = [] 637 s.append(self.sid) 638 s.append(self.sccs) 639 s.append("("+str(self.residues)+")") 640 641 if not self.getParent() : 642 s.append(self.description) 643 else : 644 sp = self.getParent() 645 dm = sp.getParent() 646 s.append(dm.description) 647 s.append("{"+sp.description+"}") 648 649 return " ".join(s)
650
651 - def toDesRecord(self):
652 """Return a Des.Record""" 653 rec = Node.toDesRecord(self) 654 rec.name = self.sid 655 return rec
656
657 - def toClaRecord(self) :
658 """Return a Cla.Record""" 659 rec = Cla.Record() 660 rec.sid = self.sid 661 rec.residues = self.residues 662 rec.sccs = self.sccs 663 rec.sunid = self.sunid 664 665 n = self 666 while n.sunid != 0: #Not root node 667 rec.hierarchy.append( (n.type, str(n.sunid)) ) 668 n = n.getParent() 669 670 rec.hierarchy.reverse() 671 672 return rec
673
674 -class Astral:
675 """Abstraction of the ASTRAL database, which has sequences for all the SCOP domains, 676 as well as clusterings by percent id or evalue. 677 """ 678
679 - def __init__( self, dir_path=None, version=None, scop=None, ind_file=None, 680 astral_file=None, db_handle=None):
681 """ 682 Initialise the astral database. 683 684 You must provide either a directory of SCOP files: 685 686 dir_path - string, the path to location of the scopseq-x.xx directory 687 (not the directory itself), and 688 version -a version number. 689 690 or, a FASTA file: 691 692 astral_file - string, a path to a fasta file (which will be loaded in memory) 693 694 or, a MYSQL database: 695 696 db_handle - a database handle for a MYSQL database containing a table 697 'astral' with the astral data in it. This can be created 698 using writeToSQL. 699 700 Note that the ind_file argument is deprecated. 701 """ 702 if ind_file : 703 raise RuntimeError("The ind_file (index file) argument is deprecated") 704 705 if astral_file==dir_path==db_handle==None: 706 raise RuntimeError("Need either file handle, or (dir_path + "\ 707 + "version) or database handle to construct Astral") 708 if not scop: 709 raise RuntimeError("Must provide a Scop instance to construct") 710 711 self.scop = scop 712 self.db_handle = db_handle 713 714 715 if not astral_file and not db_handle: 716 if dir_path == None or version == None: 717 raise RuntimeError("must provide dir_path and version") 718 719 self.version = version 720 self.path = os.path.join( dir_path, "scopseq-%s" % version) 721 astral_file = "astral-scopdom-seqres-all-%s.fa" % self.version 722 astral_file = os.path.join (self.path, astral_file) 723 724 if astral_file: 725 #Build a dictionary of SeqRecord objects in the FASTA file, IN MEMORY 726 self.fasta_dict = SeqIO.to_dict(SeqIO.parse(open(astral_file), "fasta")) 727 728 self.astral_file = astral_file 729 self.EvDatasets = {} 730 self.EvDatahash = {} 731 self.IdDatasets = {} 732 self.IdDatahash = {}
733 734
735 - def domainsClusteredByEv(self,id):
736 """get domains clustered by evalue""" 737 if id not in self.EvDatasets: 738 if self.db_handle: 739 self.EvDatasets[id] = self.getAstralDomainsFromSQL(astralEv_to_sql[id]) 740 741 else: 742 if not self.path: 743 raise RuntimeError("No scopseq directory specified") 744 745 file_prefix = "astral-scopdom-seqres-sel-gs" 746 filename = "%s-e100m-%s-%s.id" % (file_prefix, astralEv_to_file[id] , 747 self.version) 748 filename = os.path.join(self.path,filename) 749 self.EvDatasets[id] = self.getAstralDomainsFromFile(filename) 750 return self.EvDatasets[id]
751 752
753 - def domainsClusteredById(self,id):
754 """get domains clustered by percent id""" 755 if id not in self.IdDatasets: 756 if self.db_handle: 757 self.IdDatasets[id] = self.getAstralDomainsFromSQL("id"+str(id)) 758 759 else: 760 if not self.path: 761 raise RuntimeError("No scopseq directory specified") 762 763 file_prefix = "astral-scopdom-seqres-sel-gs" 764 filename = "%s-bib-%s-%s.id" % (file_prefix, id, self.version) 765 filename = os.path.join(self.path,filename) 766 self.IdDatasets[id] = self.getAstralDomainsFromFile(filename) 767 return self.IdDatasets[id]
768 769
770 - def getAstralDomainsFromFile(self,filename=None,file_handle=None):
771 """Get the scop domains from a file containing a list of sids""" 772 if file_handle == filename == none: 773 raise RuntimeError("You must provide a filename or handle") 774 if not file_handle: 775 file_handle = open(filename) 776 doms = [] 777 while 1: 778 line = file_handle.readline() 779 if not line: 780 break 781 line = line.rstrip() 782 doms.append(line) 783 if filename: 784 file_handle.close() 785 786 doms = filter( lambda a: a[0]=='d', doms ) 787 doms = map( self.scop.getDomainBySid, doms ) 788 return doms
789
790 - def getAstralDomainsFromSQL(self, column):
791 """Load a set of astral domains from a column in the astral table of a MYSQL 792 database (which can be created with writeToSQL(...)""" 793 cur = self.db_handle.cursor() 794 cur.execute("SELECT sid FROM astral WHERE "+column+"=1") 795 data = cur.fetchall() 796 data = map( lambda x: self.scop.getDomainBySid(x[0]), data) 797 798 return data
799 800
801 - def getSeqBySid(self,domain):
802 """get the seq record of a given domain from its sid""" 803 if self.db_handle is None: 804 return self.fasta_dict[domain].seq 805 806 else: 807 cur = self.db_handle.cursor() 808 cur.execute("SELECT seq FROM astral WHERE sid=%s", domain) 809 return Seq(cur.fetchone()[0])
810
811 - def getSeq(self,domain):
812 """Return seq associated with domain""" 813 return self.getSeqBySid(domain.sid)
814 815
816 - def hashedDomainsById(self,id):
817 """Get domains clustered by sequence identity in a dict""" 818 if id not in self.IdDatahash: 819 self.IdDatahash[id] = {} 820 for d in self.domainsClusteredById(id): 821 self.IdDatahash[id][d] = 1 822 return self.IdDatahash[id]
823
824 - def hashedDomainsByEv(self,id):
825 """Get domains clustered by evalue in a dict""" 826 if id not in self.EvDatahash: 827 self.EvDatahash[id] = {} 828 for d in self.domainsClusteredByEv(id): 829 self.EvDatahash[id][d] = 1 830 return self.EvDatahash[id]
831 832
833 - def isDomainInId(self,dom,id):
834 """Returns true if the domain is in the astral clusters for percent ID""" 835 return dom in self.hashedDomainsById(id)
836
837 - def isDomainInEv(self,dom,id):
838 """Returns true if the domain is in the ASTRAL clusters for evalues""" 839 return dom in self.hashedDomainsByEv(id)
840 841
842 - def writeToSQL(self, db_handle):
843 """Write the ASTRAL database to a MYSQL database""" 844 cur = db_handle.cursor() 845 846 cur.execute("DROP TABLE IF EXISTS astral") 847 cur.execute("CREATE TABLE astral (sid CHAR(8), seq TEXT, PRIMARY KEY (sid))") 848 849 for dom in self.fasta_dict.keys(): 850 cur.execute( "INSERT INTO astral (sid,seq) values (%s,%s)", 851 (dom, self.fasta_dict[dom].seq.data)) 852 853 854 for i in astralBibIds: 855 cur.execute("ALTER TABLE astral ADD (id"+str(i)+" TINYINT)") 856 857 for d in self.domainsClusteredById(i): 858 cur.execute("UPDATE astral SET id"+str(i)+"=1 WHERE sid=%s", 859 d.sid) 860 861 for ev in astralEvs: 862 cur.execute("ALTER TABLE astral ADD ("+astralEv_to_sql[ev]+" TINYINT)") 863 864 for d in self.domainsClusteredByEv(ev): 865 866 cur.execute("UPDATE astral SET "+astralEv_to_sql[ev]+"=1 WHERE sid=%s", 867 d.sid)
868
869 -def search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None, 870 cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds):
871 """search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None, 872 cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds) 873 874 Access search.cgi and return a handle to the results. See the 875 online help file for an explanation of the parameters: 876 http://scop.mrc-lmb.cam.ac.uk/scop/help.html 877 878 Raises an IOError if there's a network error. 879 880 """ 881 params = {'pdb' : pdb, 'key' : key, 'sid' : sid, 'disp' : disp, 882 'dir' : dir, 'loc' : loc} 883 variables = {} 884 for k in params.keys(): 885 if params[k] is not None: 886 variables[k] = params[k] 887 variables.update(keywds) 888 return _open(cgi, variables)
889
890 -def _open(cgi, params={}, get=1):
891 """_open(cgi, params={}, get=1) -> UndoHandle 892 893 Open a handle to SCOP. cgi is the URL for the cgi script to access. 894 params is a dictionary with the options to pass to it. get is a boolean 895 that describes whether a GET should be used. Does some 896 simple error checking, and will raise an IOError if it encounters one. 897 898 """ 899 import urllib 900 from Bio import File 901 # Open a handle to SCOP. 902 options = urllib.urlencode(params) 903 if get: # do a GET 904 fullcgi = cgi 905 if options: 906 fullcgi = "%s?%s" % (cgi, options) 907 handle = urllib.urlopen(fullcgi) 908 else: # do a POST 909 handle = urllib.urlopen(cgi, options) 910 911 # Wrap the handle inside an UndoHandle. 912 uhandle = File.UndoHandle(handle) 913 # Should I check for 404? timeout? etc? 914 return uhandle
915