Package Bio :: Package expressions :: Package blast :: Module wublast
[hide private]
[frames] | no frames]

Source Code for Module Bio.expressions.blast.wublast

  1  import warnings 
  2  warnings.warn("Bio.expressions was deprecated, as it does not work with recent versions of mxTextTools. If you want to continue to use this module, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module from Biopython", DeprecationWarning) 
  3   
  4   
  5  from Martel import * 
  6  from Martel import Time 
  7  from Bio import Std 
  8  import ncbiblast 
  9   
 10  wublast_version = Re("2\.[^-]+-WashU") 
 11   
 12  # BLASTN 2.0a19MP-WashU [05-Feb-1998] [Build decunix3.2 01:53:29 05-Feb-1998] 
 13  # BLASTP 2.0a19MP-WashU [05-Feb-1998] [Build sol2.5-ultra 01:47:24 05-Feb-1998] 
 14  blastn_appheader = replace_groups(ncbiblast.blastn_appheader, 
 15                                    [("appversion", wublast_version)]) 
 16  blastp_appheader = replace_groups(ncbiblast.blastp_appheader, 
 17                                    [("appversion", wublast_version)]) 
 18  blastx_appheader = replace_groups(ncbiblast.blastx_appheader, 
 19                                    [("appversion", wublast_version)]) 
 20  tblastn_appheader = replace_groups(ncbiblast.tblastn_appheader, 
 21                                    [("appversion", wublast_version)]) 
 22  tblastx_appheader = replace_groups(ncbiblast.tblastx_appheader, 
 23                                     [("appversion", wublast_version)]) 
 24   
 25  spaces_line = ncbiblast.spaces_line 
 26  Number = ncbiblast.Number 
 27   
 28  # Database:  YeastORF-P 
 29  #            6183 sequences; 2,900,438 total letters. 
 30   
 31  # Database:  Non-redundant GenBank CDS translations+PDB+SwissProt+SPupdate+PIR 
 32  #            307,320 sequences; 92,696,426 total letters. 
 33   
 34  # Database: mgpep 
 35  #            468 sequences; 170,400 total letters 
 36   
 37  # Database:  plantPept 
 38  #            6418 sequences; 2,370,771 total letters. 
 39   
 40  # Database: Non-redundant GenBank CDS 
 41  # translations+PDB+SwissProt+SPupdate+PIR 
 42  #            301,269 sequences; 90,873,415 total letters 
 43   
 44  query_database = (Str("Database:") + Opt(Spaces()) + 
 45                    Std.database_name(UntilEol()) + AnyEol() + 
 46                    Spaces() + 
 47                    Std.database_num_sequences(Number(), 
 48                                             {"bioformat:decode": "int.comma"}) + 
 49                    Str(" sequences;") + Spaces() +  
 50                    Std.database_num_letters(Number(), 
 51                                             {"bioformat:decode": "int.comma"}) + 
 52                    Spaces() + Str("total letters.") + ToEol()) 
 53  #                        notice the "."  -----^^^ 
 54   
 55   
 56  #                                                                      Smallest 
 57  #                                                                        Sum 
 58  #                                                               High  Probability 
 59  # Sequences producing High-scoring Segment Pairs:              Score  P(N)      N 
 60  #   
 61  # sp|P09429|HMG1_HUMAN HIGH MOBILITY GROUP PROTEIN HMG1 (HM...  1144  8.6e-117  1 
 62  # sp|P10103|HMG1_BOVIN HIGH MOBILITY GROUP PROTEIN HMG1 (HM...  1140  2.3e-116  1 
 63   
 64  to_table = (SkipLinesUntil(Str("Sequences producing High-scoring Segment")) + 
 65              ToEol() + 
 66              AnyEol()) 
 67   
 68  table_entry = (AssertNot(AnyEol()) + 
 69                 Std.search_table_description(Re(r".{60}")) + Spaces() + 
 70                 Std.search_table_value(Float(), {"name": "score", 
 71                                                  "bioformat:decode": "float"}) + 
 72                 Spaces() + 
 73                 Std.search_table_value(Float(), {"name": "P", 
 74                                                  "bioformat:decode": "float"}) + 
 75                 Spaces() + 
 76                 Std.search_table_value(Digits(), {"name": "N", 
 77                                                "bioformat:decode": "float"}) + 
 78                 AnyEol()) 
 79        
 80  header = replace_groups(ncbiblast.header, 
 81                          (("query_database", query_database), 
 82                           ("to_table", to_table), 
 83                           ("table_entry", table_entry))) 
 84   
 85  # >sp|P09429|HMG1_HUMAN HIGH MOBILITY GROUP PROTEIN HMG1 (HMG-1). 
 86  #             Length = 214 
 87  #   
 88  #  Score = 1144 (402.7 bits), Expect = 8.6e-117, P = 8.6e-117 
 89  #  Identities = 214/214 (100%), Positives = 214/214 (100%) 
 90  #   
 91  # Query:     1 GKGDPKKPRGKMSSYAFFVQTCREEHKKKHPDASVNFSEFSKKCSERWKTMSAKEKGKFE 60 
 92  #              GKGDPKKPRGKMSSYAFFVQTCREEHKKKHPDASVNFSEFSKKCSERWKTMSAKEKGKFE 
 93  # Sbjct:     1 GKGDPKKPRGKMSSYAFFVQTCREEHKKKHPDASVNFSEFSKKCSERWKTMSAKEKGKFE 60 
 94   
 95  hit_descr = ncbiblast.hit_descr 
 96  hit_length = ncbiblast.hit_length 
 97   
 98   
 99  #  Score = 1144 (402.7 bits), Expect = 8.6e-117, P = 8.6e-117 
100  #  Score = 79 (27.8 bits), Expect = 0.0088, Sum P(2) = 0.0088 
101  #  Score = 65 (22.9 bits), Expect = 3.1, Sum P(3) = 0.96 
102   
103  blastp_score = (Re(r" *Score = *") + 
104                  Std.hsp_value(Digits(), {"name": "bits", 
105                                           "bioformat:decode": "int", 
106                                           }) + 
107                  # XXX Skipping the "402.7 bits" field, what's it called? 
108                  ToSep(sep = "=") + 
109                  Spaces() + 
110                  Std.hsp_value(Float(), {"name": "expect", 
111                                           "bioformat:decode": "float", 
112                                           }) + 
113                  # What should I do with the (2)?, or the (3)? 
114                  (Str(", P =") | Re(r", Sum P\(\d+\) =")) +  
115                  Spaces() + 
116                  Std.hsp_value(Float(), {"name": "P", 
117                                           "bioformat:decode": "float", 
118                                           }) + 
119                  ToEol() 
120                  ) 
121   
122  #  Identities = 214/214 (100%), Positives = 214/214 (100%) 
123  identities = (Re(r" *Identities = ") + 
124                Std.hsp_value(Digits(), {"name": "identical", 
125                                         "bioformat:decode": "int", 
126                                         }) + 
127                Str("/") + 
128                 
129                Std.hsp_value(Digits(), {"name": "length", 
130                                         "bioformat:decode": "int", 
131                                         }) + 
132                ToSep(sep = ",") + 
133                Str(" Positives =") + 
134                Spaces() + 
135                Std.hsp_value(Digits(), {"name": "positives", 
136                                         "bioformat:decode": "int", 
137                                         }) + 
138                ToEol()) 
139   
140  blastp_hsp_header = blastp_score + identities + spaces_line 
141   
142   
143  ######################### blastn_hit 
144   
145   
146  # >U51677 Human non-histone chromatin protein HMG1 (HMG1) gene, complete cds. 
147  #         Length = 2575 
148  #   
149  #   Plus Strand HSPs: 
150  #   
151  #  Score = 12875 (1931.8 bits), Expect = 0.0, P = 0.0 
152  #  Identities = 2575/2575 (100%), Positives = 2575/2575 (100%), Strand = Plus / Plus 
153  #   
154  # Query:     1 ATGGGCAAAGGAGATCCTAAGAAGCCGAGAGGCAAAATGTCATCATATGCATTTTTTGTG 60 
155  #              |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 
156  # Sbjct:     1 ATGGGCAAAGGAGATCCTAAGAAGCCGAGAGGCAAAATGTCATCATATGCATTTTTTGTG 60 
157  # .... 
158   
159  # Don't need to parse this since it comes from the identities line 
160  blastn_strand =Re(r" *(Plus|Minus) Strand HSPs:\R") 
161   
162  blastn_score = blastp_score 
163   
164  blastn_identities = ( 
165      Re(r" *Identities = ") + 
166      Std.hsp_value(Digits(), {"name": "identical", 
167                               "bioformat:decode": "int", 
168                               }) + 
169      Str("/") + 
170       
171      Std.hsp_value(Digits(), {"name": "length", 
172                               "bioformat:decode": "int", 
173                               }) + 
174      ToSep(sep = ",") + 
175      Str(" Positives =") + 
176      Spaces() + 
177      Std.hsp_value(Digits(), {"name": "positives", 
178                               "bioformat:decode": "int", 
179                               }) + 
180      ToSep(sep = ",") + Str(" ") + 
181      ncbiblast.strand 
182      # that 'strand' already includes a newline 
183      ) 
184   
185   
186  blastn_hsp_header = (Opt(blastn_strand + spaces_line) + 
187                       blastn_score + blastn_identities + spaces_line) 
188   
189  ######################### blastx 
190   
191  #                                                                      Smallest 
192  #                                                                        Sum 
193  #                                                      Reading  High  Probability 
194  # Sequences producing High-scoring Segment Pairs:        Frame Score  P(N)      N 
195  #   
196  # sp|P09429|HMG1_HUMAN HIGH MOBILITY GROUP PROTEIN HMG1 ... +2   311  5.0e-108  4 
197  # sp|P10103|HMG1_BOVIN HIGH MOBILITY GROUP PROTEIN HMG1 ... +2   311  1.3e-107  4 
198  # sp|P07155|HMG1_MOUSE HIGH MOBILITY GROUP PROTEIN HMG1 ... +2   311  2.7e-107  4                                         
199   
200  blastx_table_entry = ( 
201      AssertNot(AnyEol()) + 
202      Std.search_table_description(Re(r".{57}")) + Spaces() + 
203      Std.search_table_value(Integer(), {"name": "frame", 
204                                         "bioformat:decode": "int"}) + 
205      Spaces() + 
206      Std.search_table_value(Float(), {"name": "score", 
207                                       "bioformat:decode": "float"}) + 
208      Spaces() + 
209      Std.search_table_value(Float(), {"name": "P", 
210                                       "bioformat:decode": "float"}) + 
211      Spaces() + 
212      Std.search_table_value(Digits(), {"name": "N", 
213                                        "bioformat:decode": "int"}) + 
214      AnyEol()) 
215   
216   
217  blastx_to_database = ( 
218   Str("  Translating both strands of query sequence in all 6 reading frames") + 
219   AnyEol() + 
220   AnyEol()) 
221   
222  blastx_identities = ( 
223      Re(r" *Identities = ") + 
224      Std.hsp_value(Digits(), {"name": "identical", 
225                               "bioformat:decode": "int", 
226                               }) + 
227      Str("/") + 
228       
229      Std.hsp_value(Digits(), {"name": "length", 
230                               "bioformat:decode": "int", 
231                               }) + 
232      ToSep(sep = ",") + 
233      Str(" Positives =") + 
234      Spaces() + 
235      Std.hsp_value(Digits(), {"name": "positives", 
236                               "bioformat:decode": "int", 
237                               }) + 
238      ToSep(sep = "=") + Str(" ") + 
239      Std.hsp_frame(Integer(), {"which": "query"}) + 
240      AnyEol() 
241      ) 
242   
243  blastx_score = blastp_score 
244  blastx_strand = blastn_strand 
245   
246  blastx_hsp_header = (Opt(blastx_strand + spaces_line) + 
247                       blastx_score + blastx_identities + spaces_line) 
248   
249   
250  ############################# 
251  ######################### tblastn 
252  tblastn_strand =Re(r" *(Plus|Minus) Strand HSPs:\R") 
253   
254  tblastn_score = blastp_score 
255   
256  tblastn_identities = ( 
257      Re(r" *Identities = ") + 
258      Std.hsp_value(Digits(), {"name": "identical", 
259                               "bioformat:decode": "int", 
260                               }) + 
261      Str("/") + 
262       
263      Std.hsp_value(Digits(), {"name": "length", 
264                               "bioformat:decode": "int", 
265                               }) + 
266      ToSep(sep = ",") + 
267      Str(" Positives =") + 
268      Spaces() + 
269      Std.hsp_value(Digits(), {"name": "positives", 
270                               "bioformat:decode": "int", 
271                               }) + 
272      ToSep(sep = ",") +  
273      ncbiblast.frame 
274      # that 'strand' already includes a newline 
275      ) 
276   
277   
278  tblastn_hsp_header = (Opt(tblastn_strand + spaces_line) + 
279                       tblastn_score + tblastn_identities + spaces_line) 
280   
281   
282   
283  ############################# 
284  # May have to skip the WARNING section 
285  ending_start = (Opt(Str("WARNING") + ToEol() + 
286                      Rep(AssertNot(Str(">")) + 
287                          AssertNot(Str("Parameters")) + 
288                          ToEol())) + 
289                  Str("Parameters:") + ToEol()) 
290   
291  parameters = SkipLinesUntil(Str("Statistics:")) 
292   
293  _nv = ncbiblast._nv 
294  int_stat = ncbiblast.int_stat 
295   
296  timestamp = Time.make_expression( 
297      "%(Mon) %(Jan) %(day) %(hour):%(minute):%(second) %(YYYY)") 
298   
299  statistics = ( 
300      Str("Statistics:") + AnyEol() + 
301      spaces_line + 
302          #  Database:  /dbEST1/wublast/database/swissprot/sprot.fa 
303      _nv("  Database: ", 
304          Std.search_statistic(UntilEol(), {"name": "database"})) + 
305       
306          #    Title:  swissprot 
307      _nv("    Title: ", 
308          Std.search_statistic(UntilEol(), {"name": "database_title"})) + 
309       
310          #    Release date:  unknown 
311      _nv("    Release date: ", 
312          Std.search_statistic(UntilEol(), {"name": "release_date"})) + 
313       
314          #    Posted date:  11:37 AM GMT Feb 21, 2000 
315      _nv("    Posted date: ", 
316          Std.search_statistic(UntilEol(), {"name": "posted_date"})) + 
317       
318          #    Format:  BLAST 
319      _nv("    Format: ", 
320          Std.search_statistic(UntilEol(), {"name": "format"})) + 
321       
322          #  # of letters in database:  26,335,942 
323      _nv("  # of letters in database:  ", 
324          int_stat("num_letters_in_database", comma = 1)) + 
325       
326          #  # of sequences in database:  72,615 
327      _nv("  # of sequences in database:  ", 
328          int_stat("num_sequences_in_database", comma = 1)) + 
329   
330          #  # of database sequences satisfying E:  1119 
331      _nv("  # of database sequences satisfying E:  ", 
332          int_stat("num_sequences_satisying_E")) + 
333   
334          #  No. of states in DFA:  549 (54 KB) 
335      _nv("  No. of states in DFA: ", 
336          Std.search_statistic(UntilEol(), {"name": "num_dfa_states"})) + 
337           
338          #  Total size of DFA:  117 KB (128 KB) 
339      _nv("  Total size of DFA: ", 
340          Std.search_statistic(UntilEol(), {"name": "total_dfa_size"})) + 
341   
342          #  Time to generate neighborhood:  0.01u 0.00s 0.01t  Elapsed: 00:00:00 
343      _nv("  Time to generate neighborhood: ", 
344          Std.search_statistic(UntilEol(), 
345                               {"name": "neighborhood_generation_time"})) + 
346   
347          #  No. of threads or processors used:  2 
348      Opt( 
349          _nv("  No. of threads or processors used:  ", 
350                  int_stat("num_threads"))) + 
351   
352          #  Search cpu time:  38.04u 0.15s 38.19t  Elapsed: 00:00:33 
353      _nv("  Search cpu time: ", 
354          Std.search_statistic(UntilEol(), {"name": "search_cpu_time"})) + 
355           
356          #  Total cpu time:  38.96u 0.29s 39.25t  Elapsed: 00:00:34 
357      _nv("  Total cpu time: ", 
358          Std.search_statistic(UntilEol(), {"name": "total_time"})) + 
359   
360          #  Start:  Mon Feb 21 14:33:13 2000   End:  Mon Feb 21 14:33:47 2000 
361      _nv("  Start:  ", 
362          Std.search_statistic(timestamp, {"name": "start_time"}) + 
363          Spaces() + Str("End:  ") + 
364          Std.search_statistic(timestamp, {"name": "end_time"})) + 
365      Opt(spaces_line + 
366              #WARNINGS ISSUED:  2 
367          _nv("WARNINGS ISSUED:  ", 
368              int_stat("num_warnings"))) 
369      ) 
370   
371  ending = ending_start + parameters + statistics 
372  blastp_ending = ending_start + parameters + statistics 
373  blastn_ending = ending_start + parameters + statistics 
374  blastx_ending = ending_start + parameters + statistics 
375   
376  format = replace_groups(ncbiblast.format, 
377                          (("to_table", to_table), 
378                           ("table_entry", table_entry), 
379                           ("ending", ending))) 
380   
381  blastp = replace_groups(format, 
382                          (("appheader", blastp_appheader), 
383                           ("hsp_header", blastp_hsp_header))) 
384   
385  blastn = replace_groups(format, 
386                          (("appheader", blastn_appheader), 
387                           ("hsp_header", blastn_hsp_header))) 
388   
389  blastx = replace_groups(format, 
390                          (("appheader", blastx_appheader), 
391                           ("table_entry", blastx_table_entry), 
392                           ("to_database", blastx_to_database), 
393                           ("hsp_header", blastx_hsp_header))) 
394  tblastn = replace_groups(format, 
395                          (("appheader", tblastn_appheader), 
396                           ("table_entry", blastx_table_entry), 
397                           ("hsp_header", tblastn_hsp_header))) 
398                            
399   
400 -def main(outf):
401 from xml.sax import saxutils 402 parser = blastn.make_parser(debug_level = 0) 403 parser.setContentHandler(saxutils.XMLGenerator()) 404 parser.parse(outf)
405 #parser.parse("wu-sh_blastp.out") 406 407 if __name__ == "__main__": 408 main() 409