1 import warnings
2 warnings.warn("Bio.expressions was deprecated, as it does not work with recent versions of mxTextTools. If you want to continue to use this module, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module from Biopython", DeprecationWarning)
3
4
5 from Martel import *
6 from Martel import Time
7 from Bio import Std
8 import ncbiblast
9
10 wublast_version = Re("2\.[^-]+-WashU")
11
12
13
14 blastn_appheader = replace_groups(ncbiblast.blastn_appheader,
15 [("appversion", wublast_version)])
16 blastp_appheader = replace_groups(ncbiblast.blastp_appheader,
17 [("appversion", wublast_version)])
18 blastx_appheader = replace_groups(ncbiblast.blastx_appheader,
19 [("appversion", wublast_version)])
20 tblastn_appheader = replace_groups(ncbiblast.tblastn_appheader,
21 [("appversion", wublast_version)])
22 tblastx_appheader = replace_groups(ncbiblast.tblastx_appheader,
23 [("appversion", wublast_version)])
24
25 spaces_line = ncbiblast.spaces_line
26 Number = ncbiblast.Number
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44 query_database = (Str("Database:") + Opt(Spaces()) +
45 Std.database_name(UntilEol()) + AnyEol() +
46 Spaces() +
47 Std.database_num_sequences(Number(),
48 {"bioformat:decode": "int.comma"}) +
49 Str(" sequences;") + Spaces() +
50 Std.database_num_letters(Number(),
51 {"bioformat:decode": "int.comma"}) +
52 Spaces() + Str("total letters.") + ToEol())
53
54
55
56
57
58
59
60
61
62
63
64 to_table = (SkipLinesUntil(Str("Sequences producing High-scoring Segment")) +
65 ToEol() +
66 AnyEol())
67
68 table_entry = (AssertNot(AnyEol()) +
69 Std.search_table_description(Re(r".{60}")) + Spaces() +
70 Std.search_table_value(Float(), {"name": "score",
71 "bioformat:decode": "float"}) +
72 Spaces() +
73 Std.search_table_value(Float(), {"name": "P",
74 "bioformat:decode": "float"}) +
75 Spaces() +
76 Std.search_table_value(Digits(), {"name": "N",
77 "bioformat:decode": "float"}) +
78 AnyEol())
79
80 header = replace_groups(ncbiblast.header,
81 (("query_database", query_database),
82 ("to_table", to_table),
83 ("table_entry", table_entry)))
84
85
86
87
88
89
90
91
92
93
94
95 hit_descr = ncbiblast.hit_descr
96 hit_length = ncbiblast.hit_length
97
98
99
100
101
102
103 blastp_score = (Re(r" *Score = *") +
104 Std.hsp_value(Digits(), {"name": "bits",
105 "bioformat:decode": "int",
106 }) +
107
108 ToSep(sep = "=") +
109 Spaces() +
110 Std.hsp_value(Float(), {"name": "expect",
111 "bioformat:decode": "float",
112 }) +
113
114 (Str(", P =") | Re(r", Sum P\(\d+\) =")) +
115 Spaces() +
116 Std.hsp_value(Float(), {"name": "P",
117 "bioformat:decode": "float",
118 }) +
119 ToEol()
120 )
121
122
123 identities = (Re(r" *Identities = ") +
124 Std.hsp_value(Digits(), {"name": "identical",
125 "bioformat:decode": "int",
126 }) +
127 Str("/") +
128
129 Std.hsp_value(Digits(), {"name": "length",
130 "bioformat:decode": "int",
131 }) +
132 ToSep(sep = ",") +
133 Str(" Positives =") +
134 Spaces() +
135 Std.hsp_value(Digits(), {"name": "positives",
136 "bioformat:decode": "int",
137 }) +
138 ToEol())
139
140 blastp_hsp_header = blastp_score + identities + spaces_line
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160 blastn_strand =Re(r" *(Plus|Minus) Strand HSPs:\R")
161
162 blastn_score = blastp_score
163
164 blastn_identities = (
165 Re(r" *Identities = ") +
166 Std.hsp_value(Digits(), {"name": "identical",
167 "bioformat:decode": "int",
168 }) +
169 Str("/") +
170
171 Std.hsp_value(Digits(), {"name": "length",
172 "bioformat:decode": "int",
173 }) +
174 ToSep(sep = ",") +
175 Str(" Positives =") +
176 Spaces() +
177 Std.hsp_value(Digits(), {"name": "positives",
178 "bioformat:decode": "int",
179 }) +
180 ToSep(sep = ",") + Str(" ") +
181 ncbiblast.strand
182
183 )
184
185
186 blastn_hsp_header = (Opt(blastn_strand + spaces_line) +
187 blastn_score + blastn_identities + spaces_line)
188
189
190
191
192
193
194
195
196
197
198
199
200 blastx_table_entry = (
201 AssertNot(AnyEol()) +
202 Std.search_table_description(Re(r".{57}")) + Spaces() +
203 Std.search_table_value(Integer(), {"name": "frame",
204 "bioformat:decode": "int"}) +
205 Spaces() +
206 Std.search_table_value(Float(), {"name": "score",
207 "bioformat:decode": "float"}) +
208 Spaces() +
209 Std.search_table_value(Float(), {"name": "P",
210 "bioformat:decode": "float"}) +
211 Spaces() +
212 Std.search_table_value(Digits(), {"name": "N",
213 "bioformat:decode": "int"}) +
214 AnyEol())
215
216
217 blastx_to_database = (
218 Str(" Translating both strands of query sequence in all 6 reading frames") +
219 AnyEol() +
220 AnyEol())
221
222 blastx_identities = (
223 Re(r" *Identities = ") +
224 Std.hsp_value(Digits(), {"name": "identical",
225 "bioformat:decode": "int",
226 }) +
227 Str("/") +
228
229 Std.hsp_value(Digits(), {"name": "length",
230 "bioformat:decode": "int",
231 }) +
232 ToSep(sep = ",") +
233 Str(" Positives =") +
234 Spaces() +
235 Std.hsp_value(Digits(), {"name": "positives",
236 "bioformat:decode": "int",
237 }) +
238 ToSep(sep = "=") + Str(" ") +
239 Std.hsp_frame(Integer(), {"which": "query"}) +
240 AnyEol()
241 )
242
243 blastx_score = blastp_score
244 blastx_strand = blastn_strand
245
246 blastx_hsp_header = (Opt(blastx_strand + spaces_line) +
247 blastx_score + blastx_identities + spaces_line)
248
249
250
251
252 tblastn_strand =Re(r" *(Plus|Minus) Strand HSPs:\R")
253
254 tblastn_score = blastp_score
255
256 tblastn_identities = (
257 Re(r" *Identities = ") +
258 Std.hsp_value(Digits(), {"name": "identical",
259 "bioformat:decode": "int",
260 }) +
261 Str("/") +
262
263 Std.hsp_value(Digits(), {"name": "length",
264 "bioformat:decode": "int",
265 }) +
266 ToSep(sep = ",") +
267 Str(" Positives =") +
268 Spaces() +
269 Std.hsp_value(Digits(), {"name": "positives",
270 "bioformat:decode": "int",
271 }) +
272 ToSep(sep = ",") +
273 ncbiblast.frame
274
275 )
276
277
278 tblastn_hsp_header = (Opt(tblastn_strand + spaces_line) +
279 tblastn_score + tblastn_identities + spaces_line)
280
281
282
283
284
285 ending_start = (Opt(Str("WARNING") + ToEol() +
286 Rep(AssertNot(Str(">")) +
287 AssertNot(Str("Parameters")) +
288 ToEol())) +
289 Str("Parameters:") + ToEol())
290
291 parameters = SkipLinesUntil(Str("Statistics:"))
292
293 _nv = ncbiblast._nv
294 int_stat = ncbiblast.int_stat
295
296 timestamp = Time.make_expression(
297 "%(Mon) %(Jan) %(day) %(hour):%(minute):%(second) %(YYYY)")
298
299 statistics = (
300 Str("Statistics:") + AnyEol() +
301 spaces_line +
302
303 _nv(" Database: ",
304 Std.search_statistic(UntilEol(), {"name": "database"})) +
305
306
307 _nv(" Title: ",
308 Std.search_statistic(UntilEol(), {"name": "database_title"})) +
309
310
311 _nv(" Release date: ",
312 Std.search_statistic(UntilEol(), {"name": "release_date"})) +
313
314
315 _nv(" Posted date: ",
316 Std.search_statistic(UntilEol(), {"name": "posted_date"})) +
317
318
319 _nv(" Format: ",
320 Std.search_statistic(UntilEol(), {"name": "format"})) +
321
322
323 _nv(" # of letters in database: ",
324 int_stat("num_letters_in_database", comma = 1)) +
325
326
327 _nv(" # of sequences in database: ",
328 int_stat("num_sequences_in_database", comma = 1)) +
329
330
331 _nv(" # of database sequences satisfying E: ",
332 int_stat("num_sequences_satisying_E")) +
333
334
335 _nv(" No. of states in DFA: ",
336 Std.search_statistic(UntilEol(), {"name": "num_dfa_states"})) +
337
338
339 _nv(" Total size of DFA: ",
340 Std.search_statistic(UntilEol(), {"name": "total_dfa_size"})) +
341
342
343 _nv(" Time to generate neighborhood: ",
344 Std.search_statistic(UntilEol(),
345 {"name": "neighborhood_generation_time"})) +
346
347
348 Opt(
349 _nv(" No. of threads or processors used: ",
350 int_stat("num_threads"))) +
351
352
353 _nv(" Search cpu time: ",
354 Std.search_statistic(UntilEol(), {"name": "search_cpu_time"})) +
355
356
357 _nv(" Total cpu time: ",
358 Std.search_statistic(UntilEol(), {"name": "total_time"})) +
359
360
361 _nv(" Start: ",
362 Std.search_statistic(timestamp, {"name": "start_time"}) +
363 Spaces() + Str("End: ") +
364 Std.search_statistic(timestamp, {"name": "end_time"})) +
365 Opt(spaces_line +
366
367 _nv("WARNINGS ISSUED: ",
368 int_stat("num_warnings")))
369 )
370
371 ending = ending_start + parameters + statistics
372 blastp_ending = ending_start + parameters + statistics
373 blastn_ending = ending_start + parameters + statistics
374 blastx_ending = ending_start + parameters + statistics
375
376 format = replace_groups(ncbiblast.format,
377 (("to_table", to_table),
378 ("table_entry", table_entry),
379 ("ending", ending)))
380
381 blastp = replace_groups(format,
382 (("appheader", blastp_appheader),
383 ("hsp_header", blastp_hsp_header)))
384
385 blastn = replace_groups(format,
386 (("appheader", blastn_appheader),
387 ("hsp_header", blastn_hsp_header)))
388
389 blastx = replace_groups(format,
390 (("appheader", blastx_appheader),
391 ("table_entry", blastx_table_entry),
392 ("to_database", blastx_to_database),
393 ("hsp_header", blastx_hsp_header)))
394 tblastn = replace_groups(format,
395 (("appheader", tblastn_appheader),
396 ("table_entry", blastx_table_entry),
397 ("hsp_header", tblastn_hsp_header)))
398
399
401 from xml.sax import saxutils
402 parser = blastn.make_parser(debug_level = 0)
403 parser.setContentHandler(saxutils.XMLGenerator())
404 parser.parse(outf)
405
406
407 if __name__ == "__main__":
408 main()
409