1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36 __doc__="Access the PDB over the internet (for example to download structures)."
37
import os
import re
import shlex
import shutil
import subprocess
import sys
import urllib

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2
39
41 """
42 This class provides quick access to the structure lists on the
43 PDB server or its mirrors. The structure lists contain
44 four-letter PDB codes, indicating that structures are
45 new, have been modified or are obsolete. The lists are released
46 on a weekly basis.
47
48 It also provides a function to retrieve PDB files from the server.
49 To use it properly, prepare a directory /pdb or the like,
50 where PDB files are stored.
51
52 If you want to use this module from behind a proxy, add
53 the proxy variable to your environment, e.g. in Unix
54 export HTTP_PROXY='http://realproxy.charite.de:888'
55 (This can also be added to ~/.bashrc)
56 """
57
58 PDB_REF="""
59 The Protein Data Bank: a computer-based archival file for macromolecular structures.
60 F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
61 J. Mol. Biol. 112 pp. 535-542 (1977)
62 http://www.pdb.org/.
63 """
64
65 alternative_download_url = "http://www.rcsb.org/pdb/files/"
66
67
68
def __init__(self, server='ftp://ftp.wwpdb.org', pdb=None, obsolete_pdb=None):
    """Initialize the class with the default server or a custom one.

    @param server: base URL of the PDB server or mirror
    @type server: string

    @param pdb: root of the local PDB file tree (default: the working
        directory at the time of the call)
    @type pdb: string

    @param obsolete_pdb: root of the local tree for obsolete entries
        (default: an 'obsolete' directory below pdb, created if missing)
    @type obsolete_pdb: string
    """
    self.pdb_server = server

    # Resolve the working directory per call: a getcwd() default in the
    # signature is evaluated only once, when the function is defined.
    if pdb is None:
        pdb = os.getcwd()
    self.local_pdb = pdb

    if obsolete_pdb:
        self.obsolete_pdb = obsolete_pdb
    else:
        self.obsolete_pdb = os.path.join(self.local_pdb, 'obsolete')
        if not os.path.exists(self.obsolete_pdb):
            os.makedirs(self.obsolete_pdb)

    # Switched on by the command-line options -o and -d respectively.
    self.overwrite = 0
    self.flat_tree = 0
88
89
def get_status_list(self, url):
    """Retrieve the PDB codes listed in a weekly PDB status file.

    Reads the listing at the given URL and collects a code from every
    line whose last whitespace-separated field ends in '.ent'.
    Used by get_recent_changes.

    Typical contents of the list files parsed by this method:
    -rw-r--r-- 1 rcsb rcsb 330156 Oct 14 2003 pdb1cyq.ent
    -rw-r--r-- 1 rcsb rcsb 333639 Oct 14 2003 pdb1cz0.ent

    @param url: location of the status list
    @type url: string

    @return: PDB codes in the order they are listed
    @rtype: list of strings
    """
    handle = urlopen(url)
    try:
        lines = handle.readlines()
    finally:
        handle.close()

    codes = []
    for line in lines:
        if not isinstance(line, str):
            # Python 3 hands back bytes; normalise to text before parsing.
            line = line.decode('ascii', 'replace')
        fields = line.split()
        # Blank lines have no fields at all; skip them instead of failing.
        if fields and fields[-1].endswith('.ent'):
            # 'pdb1cyq.ent' -> '1cyq'
            codes.append(fields[-1][3:7])
    return codes
107
108
def get_recent_changes(self):
    """Return three lists of the newest weekly files (added, mod, obsolete).

    Reads the directory listing with changed entries from the PDB server,
    picks the directory with the largest numerical name and returns the
    PDB codes found in its added, modified and obsolete status lists.

    Contents of the data/status dir (20031013 would be used):
    drwxrwxr-x 2 1002 sysadmin 512 Oct 6 18:28 20031006
    drwxrwxr-x 2 1002 sysadmin 512 Oct 14 02:14 20031013
    -rw-r--r-- 1 1002 sysadmin 1327 Mar 12 2001 README

    Failure to open the status directory itself is not caught here.

    @return: [added, modified, obsolete], each a list of PDB codes, or
        None if no dated directory is listed or a status list cannot
        be read
    @rtype: list of lists of strings
    """
    status_url = self.pdb_server + '/pub/pdb/data/status/'
    handle = urlopen(status_url)
    try:
        listing = handle.readlines()
    finally:
        handle.close()

    dated_dirs = []
    for line in listing:
        if not isinstance(line, str):
            # Python 3 hands back bytes; normalise to text before parsing.
            line = line.decode('ascii', 'replace')
        fields = line.split()
        if fields and fields[-1].isdigit():
            dated_dirs.append(fields[-1])
    if not dated_dirs:
        return None

    # Compare numerically so the result does not depend on listing order.
    recent = max(dated_dirs, key=int)
    path = status_url + '%s/' % recent
    try:
        return [self.get_status_list(path + name)
                for name in ('added.pdb', 'modified.pdb', 'obsolete.pdb')]
    except EnvironmentError:
        # Only transfer problems mean "no result"; programming errors
        # are no longer hidden behind a blanket except.
        return None
141
142
143
def get_all_entries(self):
    """Retrieve the index file listing all PDB entries.

    Downloads derived_data/index/entries.idx from the server, skips its
    first two lines and takes the first four characters of every
    remaining line that is longer than four characters.

    @return: PDB codes as they appear in the index file
    @rtype: list of strings
    """
    print("retrieving index file. Takes about 5 MB.")
    handle = urlopen(self.pdb_server +
                     '/pub/pdb/derived_data/index/entries.idx')
    try:
        lines = handle.readlines()
    finally:
        handle.close()

    entries = []
    for line in lines[2:]:
        if not isinstance(line, str):
            # Python 3 hands back bytes; normalise to text before parsing.
            line = line.decode('ascii', 'replace')
        if len(line) > 4:
            entries.append(line[:4])
    return entries
157
158
159
def get_all_obsolete(self):
    """Return a list of all obsolete entries ever in the PDB.

    Gets and parses data/status/obsolete.dat from the PDB server, which
    has the format (the first pdb_code column is the one used):
    LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
    OBSLTE 30-SEP-03 1Q1D 1QZR
    OBSLTE 26-SEP-03 1DYV 1UN2

    @return: lower-case PDB codes of the obsolete entries
    @rtype: list of strings
    """
    handle = urlopen(self.pdb_server + '/pub/pdb/data/status/obsolete.dat')
    try:
        lines = handle.readlines()
    finally:
        handle.close()

    obsolete = []
    for line in lines:
        if not isinstance(line, str):
            # Python 3 hands back bytes; normalise to text before parsing.
            line = line.decode('ascii', 'replace')
        if not line.startswith('OBSLTE'):
            continue
        # Take the code by field position (tag, date, code) rather than
        # by fixed character offsets, which break when spacing differs.
        fields = line.split()
        if len(fields) > 2:
            obsolete.append(fields[2].lower())
    return obsolete
178
179
180
def retrieve_pdb_file(self, pdb_code, obsolete=0, compression='.gz',
                      uncompress="gunzip", pdir=None):
    """Retrieve a PDB structure file from the PDB server and
    store it in a local file tree.

    The compressed file is downloaded into the target directory and
    unpacked there by running the uncompress command on it. If the
    unpacked file already exists and self.overwrite is not set, nothing
    is downloaded.

    @param pdb_code: PDB code of the structure (case-insensitive)
    @type pdb_code: string

    @param obsolete: if true, fetch from the server's obsolete tree and
        store below self.obsolete_pdb instead of self.local_pdb
    @type obsolete: int

    @param compression: suffix of the remote file, '.Z' or '.gz'
    @type compression: string

    @param uncompress: command called to uncompress the file; it may
        carry options, the file name is appended as the last argument
    @type uncompress: string

    @param pdir: put the file in this directory (default: create a
        PDB-style directory tree)
    @type pdir: string

    @return: name of the uncompressed file (not its contents)
    @rtype: string

    @raise OSError: if the uncompress command cannot be started
    """
    code = pdb_code.lower()
    archive_name = "pdb%s.ent%s" % (code, compression)
    if obsolete:
        remote_tree = 'obsolete'
    else:
        remote_tree = 'divided'
    url = (self.pdb_server +
           '/pub/pdb/data/structures/%s/pdb/%s/%s'
           % (remote_tree, code[1:3], archive_name))

    # Where does the final PDB file get saved?
    if pdir is not None:
        path = pdir
    else:
        if obsolete:
            path = self.obsolete_pdb
        else:
            path = self.local_pdb
        if not self.flat_tree:
            # PDB-style tree: one subdirectory per middle two characters.
            path = os.path.join(path, code[1:3])

    if not os.path.exists(path):
        os.makedirs(path)

    filename = os.path.join(path, archive_name)
    final_file = os.path.join(path, "pdb%s.ent" % code)

    # Skip the download if the file already exists.
    if not self.overwrite and os.path.exists(final_file):
        print("file exists, not retrieved %s" % final_file)
        return final_file

    print('retrieving %s' % url)
    remote = urlopen(url)
    try:
        data = remote.read()
    finally:
        remote.close()

    # Close the archive before the external tool reads it, so every byte
    # is guaranteed to be on disk.
    archive = open(filename, 'wb')
    try:
        archive.write(data)
    finally:
        archive.close()

    # Pass an argument list instead of a shell string: directory names
    # with spaces or shell metacharacters must not be re-interpreted.
    subprocess.call(shlex.split(uncompress) + [filename])

    return final_file
246
247
248
def update_pdb(self):
    """Bring the local PDB copy up to date with the weekly changes.

    Gets the weekly lists of new, modified and obsolete PDB entries,
    downloads the new and modified files and moves local copies of
    obsolete entries into the obsolete tree. Suitable for running as a
    weekly cronjob.

    A failed download is reported and does not stop the update.
    Obsolete entries without a local copy are skipped.
    """
    changes = self.get_recent_changes()
    if changes is None:
        # get_recent_changes signals failure with None; indexing it
        # would only raise an unhelpful TypeError.
        print('error: could not retrieve the weekly status lists')
        return
    new, modified, obsolete = changes

    for pdb_code in new + modified:
        try:
            print('retrieving %s' % pdb_code)
            self.retrieve_pdb_file(pdb_code)
        except Exception:
            # Best effort: keep going with the remaining entries, but
            # let KeyboardInterrupt/SystemExit through.
            print('error %s' % pdb_code)

    # Move the obsolete files to a special folder.
    for pdb_code in obsolete:
        name = 'pdb%s.ent' % pdb_code
        if self.flat_tree:
            old_dir = self.local_pdb
            new_dir = self.obsolete_pdb
        else:
            old_dir = os.path.join(self.local_pdb, pdb_code[1:3])
            new_dir = os.path.join(self.obsolete_pdb, pdb_code[1:3])
        old_file = os.path.join(old_dir, name)
        if not os.path.isfile(old_file):
            # Never downloaded locally, so there is nothing to move.
            continue
        if not os.path.isdir(new_dir):
            os.makedirs(new_dir)
        # The os module has no cmd(); move the file portably instead of
        # shelling out to 'mv'.
        shutil.move(old_file, os.path.join(new_dir, name))
279
280
def download_entire_pdb(self, listfile=None):
    """Retrieve all PDB entries not present in the local PDB copy.

    Every code returned by get_all_entries is passed to
    retrieve_pdb_file.

    @param listfile: if given, file to which all PDB codes are written,
        one per line
    @type listfile: string
    """
    entries = self.get_all_entries()
    for pdb_code in entries:
        self.retrieve_pdb_file(pdb_code)

    # Write the list of codes, closing the file deterministically.
    if listfile:
        outfile = open(listfile, 'w')
        try:
            outfile.writelines([code + '\n' for code in entries])
        finally:
            outfile.close()
291
292
294
def download_obsolete_entries(self, listfile=None):
    """Retrieve all obsolete PDB entries not present in the local
    obsolete PDB copy.

    Every code returned by get_all_obsolete is passed to
    retrieve_pdb_file with obsolete=1.

    @param listfile: if given, file to which all obsolete PDB codes are
        written, one per line
    @type listfile: string
    """
    entries = self.get_all_obsolete()
    for pdb_code in entries:
        self.retrieve_pdb_file(pdb_code, obsolete=1)

    # Write the list of codes, closing the file deterministically.
    if listfile:
        outfile = open(listfile, 'w')
        try:
            outfile.writelines([code + '\n' for code in entries])
        finally:
            outfile.close()
304
305
306
307
308
309
310
312 """Retrieves a (big) file containing all the sequences
313 of PDB entries and writes it to a file."""
314 print "retrieving sequence file. Takes about 15 MB."
315 url = urllib.urlopen(self.pdb_server+'/pub/pdb/derived_data/pdb_seqres.txt')
316 file = url.readlines()
317 open(savefile,'w').writelines(file)
318
319
320
if __name__ == '__main__':
    # Command-line entry point: the usage text is always shown, then the
    # command given as first argument (if any) is carried out.
    doc = """PDBList.py
(c) Kristian Rother 2003, Contributed to BioPython

Usage:
PDBList.py update <pdb_path> [options] - write weekly PDB updates to
local pdb tree.
PDBList.py all <pdb_path> [options] - write all PDB entries to
local pdb tree.
PDBList.py obsol <pdb_path> [options] - write all obsolete PDB
entries to local pdb tree.
PDBList.py <PDB-ID> <pdb_path> [options] - retrieve single structure

Options:
-d A single directory will be used as <pdb_path>, not a tree.
-o Overwrite existing structure files.
"""
    print(doc)

    if len(sys.argv) > 2:
        pdb_path = sys.argv[2]
        pl = PDBList(pdb=pdb_path)
        # Slicing yields an empty list when no options follow the path.
        for option in sys.argv[3:]:
            if option == '-d':
                pl.flat_tree = 1
            elif option == '-o':
                pl.overwrite = 1
    else:
        # No path given: work in the current directory, without a tree.
        pdb_path = os.getcwd()
        pl = PDBList()
        pl.flat_tree = 1

    if len(sys.argv) > 1:
        command = sys.argv[1]
        if command == 'update':
            # Update PDB.
            print("updating local PDB at " + pdb_path)
            pl.update_pdb()

        elif command == 'all':
            # Get the entire PDB.
            pl.download_entire_pdb()

        elif command == 'obsol':
            # Get all obsolete entries. The method's argument is the
            # name of a list file, not the PDB directory; passing
            # pdb_path made it try to open that directory for writing.
            pl.download_obsolete_entries(listfile=None)

        elif re.search(r'^\d...$', command):
            # Get a single PDB entry: a digit followed by three characters.
            pl.retrieve_pdb_file(command, pdir=pdb_path)
370