1 """Substitution matrices, log odds matrices, and operations on them.
2 """
3 import re
4 import sys
5 import copy
6 import math
7
8 from Bio import Alphabet
9 from Bio.SubsMat import FreqTable
10
11
12 log = math.log
13
14 NOTYPE = 0
15 ACCREP = 1
16 OBSFREQ = 2
17 SUBS = 3
18 EXPFREQ = 4
19 LO = 5
20 EPSILON = 0.00000000000001
22 """Exception raised when verifying a matrix"""
25 BadMatrixError = BadMatrix()
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
129 """A Generic sequence matrix class
130 The key is a 2-tuple containing the letter indices of the matrix. Those
131 should be sorted in the tuple (low, high). Because each matrix is dealt
132 with as a half-matrix."""
133
135 ab_dict = {}
136 s = ''
137 for i in self.keys():
138 ab_dict[i[0]] = 1
139 ab_dict[i[1]] = 1
140 letters_list = ab_dict.keys()
141 letters_list.sort()
142 for i in letters_list:
143 s = s + i
144 self.alphabet.letters = s
145
146 - def __init__(self,data=None, alphabet=None,
147 mat_type=NOTYPE,mat_name='',build_later=0):
201
203 keylist = self.keys()
204 for key in keylist:
205 if key[0] > key[1]:
206 self[(key[1],key[0])] = self[key]
207 del self[key]
208
210 """
211 Convert a full-matrix to a half-matrix
212 """
213
214
215
216
217 N = len(self.alphabet.letters)
218
219 if len(self) == N*(N+1)/2:
220 return
221 for i in self.ab_list:
222 for j in self.ab_list[:self.ab_list.index(i)+1]:
223 if i != j:
224 self[j,i] = self[j,i] + self[i,j]
225 del self[i,j]
226
228 for i in self.ab_list:
229 for j in self.ab_list[:self.ab_list.index(i)+1]:
230 self[j,i] = 0.
231
233 """if this matrix is a log-odds matrix, return its entropy
234 Needs the observed frequency matrix for that"""
235 ent = 0.
236 if self.mat_type == LO:
237 for i in self.keys():
238 ent += obs_freq_mat[i]*self[i]/log(2)
239 elif self.mat_type == SUBS:
240 for i in self.keys():
241 if self[i] > EPSILON:
242 ent += obs_freq_mat[i]*log(self[i])/log(2)
243 else:
244 raise TypeError("entropy: substitution or log-odds matrices only")
245 self.relative_entropy = ent
246
248 self.entropy = 0
249 for i in self.keys():
250 if self[i] > EPSILON:
251 self.entropy += self[i]*log(self[i])/log(2)
252 self.entropy = -self.entropy
254 assert letter in self.alphabet.letters
255 sum = 0.
256 for i in self.keys():
257 if letter in i:
258 if i[0] == i[1]:
259 sum += self[i]
260 else:
261 sum += (self[i] / 2.)
262 return sum
263
267 - def print_full_mat(self,f=None,format="%4d",topformat="%4s",
268 alphabet=None,factor=1,non_sym=None):
269 f = f or sys.stdout
270
271
272 assert non_sym == None or type(non_sym) == type(1.) or \
273 type(non_sym) == type(1)
274 full_mat = copy.copy(self)
275 for i in self:
276 if i[0] != i[1]:
277 full_mat[(i[1],i[0])] = full_mat[i]
278 if not alphabet:
279 alphabet = self.ab_list
280 topline = ''
281 for i in alphabet:
282 topline = topline + topformat % i
283 topline = topline + '\n'
284 f.write(topline)
285 for i in alphabet:
286 outline = i
287 for j in alphabet:
288 if alphabet.index(j) > alphabet.index(i) and non_sym is not None:
289 val = non_sym
290 else:
291 val = full_mat[i,j]
292 val *= factor
293 if val <= -999:
294 cur_str = ' ND'
295 else:
296 cur_str = format % val
297
298 outline = outline+cur_str
299 outline = outline+'\n'
300 f.write(outline)
301
302 - def print_mat(self,f=None,format="%4d",bottomformat="%4s",
303 alphabet=None,factor=1):
304 """Print a nice half-matrix. f=sys.stdout to see on the screen
305 User may pass own alphabet, which should contain all letters in the
306 alphabet of the matrix, but may be in a different order. This
307 order will be the order of the letters on the axes"""
308
309 f = f or sys.stdout
310 if not alphabet:
311 alphabet = self.ab_list
312 bottomline = ''
313 for i in alphabet:
314 bottomline = bottomline + bottomformat % i
315 bottomline = bottomline + '\n'
316 for i in alphabet:
317 outline = i
318 for j in alphabet[:alphabet.index(i)+1]:
319 try:
320 val = self[j,i]
321 except KeyError:
322 val = self[i,j]
323 val *= factor
324 if val == -999:
325 cur_str = ' ND'
326 else:
327 cur_str = format % val
328
329 outline = outline+cur_str
330 outline = outline+'\n'
331 f.write(outline)
332 f.write(bottomline)
334 """ returns a number which is the subtraction product of the two matrices"""
335 mat_diff = 0
336 for i in self.keys():
337 mat_diff += (self[i] - other[i])
338 return mat_diff
340 """ returns a matrix for which each entry is the multiplication product of the
341 two matrices passed"""
342 new_mat = copy.copy(self)
343 for i in self.keys():
344 new_mat[i] *= other[i]
345 return new_mat
347 new_mat = copy.copy(self)
348 for i in self.keys():
349 new_mat[i] += other[i]
350 return new_mat
351
353 """
354 build_obs_freq_mat(acc_rep_mat):
355 Build the observed frequency matrix, from an accepted replacements matrix
356 The accRep matrix should be generated by the user.
357 """
358
359 sum = 0.
360 for i in acc_rep_mat.values():
361 sum += i
362 obs_freq_mat = SeqMat(alphabet=acc_rep_mat.alphabet,build_later=1)
363 for i in acc_rep_mat.keys():
364 obs_freq_mat[i] = acc_rep_mat[i]/sum
365 obs_freq_mat.mat_type = OBSFREQ
366 return obs_freq_mat
367
369 exp_freq_table = {}
370 for i in obs_freq_mat.alphabet.letters:
371 exp_freq_table[i] = 0.
372 for i in obs_freq_mat.keys():
373 if i[0] == i[1]:
374 exp_freq_table[i[0]] += obs_freq_mat[i]
375 else:
376 exp_freq_table[i[0]] += obs_freq_mat[i] / 2.
377 exp_freq_table[i[1]] += obs_freq_mat[i] / 2.
378 return FreqTable.FreqTable(exp_freq_table,FreqTable.FREQ)
379
381 """Build an expected frequency matrix
382 exp_freq_table: should be a FreqTable instance
383 """
384 exp_freq_mat = SeqMat(alphabet=exp_freq_table.alphabet,build_later=1)
385 for i in exp_freq_mat.keys():
386 if i[0] == i[1]:
387 exp_freq_mat[i] = exp_freq_table[i[0]]**2
388 else:
389 exp_freq_mat[i] = 2.0*exp_freq_table[i[0]]*exp_freq_table[i[1]]
390 exp_freq_mat.mat_type = EXPFREQ
391 return exp_freq_mat
392
393
394
396 """ Build the substitution matrix """
397 if obs_freq_mat.ab_list != exp_freq_mat.ab_list:
398 raise ValueError("Alphabet mismatch in passed matrices")
399 subs_mat = SeqMat(obs_freq_mat)
400 for i in obs_freq_mat.keys():
401 subs_mat[i] = obs_freq_mat[i]/exp_freq_mat[i]
402
403 subs_mat.mat_type = SUBS
404 return subs_mat
405
406
407
408
410 """_build_log_odds_mat(subs_mat,logbase=10,factor=10.0,round_digit=1):
411 Build a log-odds matrix
412 logbase=2: base of logarithm used to build (default 2)
413 factor=10.: a factor by which each matrix entry is multiplied
414 round_digit: roundoff place after decimal point
415 keep_nd: if true, keeps the -999 value for non-determined values (for which there
416 are no substitutions in the frequency substitutions matrix). If false, plants the
417 minimum log-odds value of the matrix in entries containing -999
418 """
419 lo_mat = SeqMat(subs_mat)
420 for i in subs_mat.keys():
421 if subs_mat[i] < EPSILON:
422 lo_mat[i] = -999
423 else:
424 lo_mat[i] = round(factor*log(subs_mat[i])/log(logbase),round_digit)
425 lo_mat.mat_type = LO
426 mat_min = min(lo_mat.values())
427 if not keep_nd:
428 for i in lo_mat.keys():
429 if lo_mat[i] <= -999:
430 lo_mat[i] = mat_min
431 return lo_mat
432
433
434
435
436
437
438
439 -def make_log_odds_matrix(acc_rep_mat,exp_freq_table=None,logbase=2,
440 factor=1.,round_digit=9,keep_nd=0):
454 -def read_text_matrix(data_file,mat_type=NOTYPE):
455 matrix = {}
456 tmp = data_file.read().split("\n")
457 table=[]
458 for i in tmp:
459 table.append(i.split())
460
461 for rec in table[:]:
462 if (rec.count('#') > 0):
463 table.remove(rec)
464
465
466 while (table.count([]) > 0):
467 table.remove([])
468
469 alphabet = table[0]
470 j = 0
471 for rec in table[1:]:
472
473 row = alphabet[j]
474
475 if re.compile('[A-z\*]').match(rec[0]):
476 first_col = 1
477 else:
478 first_col = 0
479 i = 0
480 for field in rec[first_col:]:
481 col = alphabet[i]
482 matrix[(row,col)] = float(field)
483 i += 1
484 j += 1
485
486 for i in matrix.keys():
487 if '*' in i: del(matrix[i])
488 ret_mat = SeqMat(matrix,mat_type=mat_type)
489 return ret_mat
490
491 diagNO = 1
492 diagONLY = 2
493 diagALL = 3
494
496 rel_ent = 0.
497 key_list_1 = mat_1.keys(); key_list_2 = mat_2.keys()
498 key_list_1.sort(); key_list_2.sort()
499 key_list = []
500 sum_ent_1 = 0.; sum_ent_2 = 0.
501 for i in key_list_1:
502 if i in key_list_2:
503 key_list.append(i)
504 if len(key_list_1) != len(key_list_2):
505
506 sys.stderr.write("Warning:first matrix has more entries than the second\n")
507 if key_list_1 != key_list_2:
508 sys.stderr.write("Warning: indices not the same between matrices\n")
509 for key in key_list:
510 if diag == diagNO and key[0] == key[1]:
511 continue
512 if diag == diagONLY and key[0] != key[1]:
513 continue
514 if mat_1[key] > EPSILON and mat_2[key] > EPSILON:
515 sum_ent_1 += mat_1[key]
516 sum_ent_2 += mat_2[key]
517
518 for key in key_list:
519 if diag == diagNO and key[0] == key[1]:
520 continue
521 if diag == diagONLY and key[0] != key[1]:
522 continue
523 if mat_1[key] > EPSILON and mat_2[key] > EPSILON:
524 val_1 = mat_1[key] / sum_ent_1
525 val_2 = mat_2[key] / sum_ent_2
526
527 rel_ent += val_1 * log(val_1/val_2)/log(logbase)
528 return rel_ent
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
547 assert mat_1.ab_list == mat_2.ab_list
548 assert pi_1 > 0 and pi_2 > 0 and pi_1< 1 and pi_2 <1
549 assert not (pi_1 + pi_2 - 1.0 > EPSILON)
550 sum_mat = SeqMat(build_later=1)
551 sum_mat.ab_list = mat_1.ab_list
552 for i in mat_1.keys():
553 sum_mat[i] = pi_1 * mat_1[i] + pi_2 * mat_2[i]
554 sum_mat.make_entropy()
555 mat_1.make_entropy()
556 mat_2.make_entropy()
557
558 dJS = sum_mat.entropy - pi_1 * mat_1.entropy - pi_2 *mat_2.entropy
559 return dJS
560
561 """
562 This isn't working yet. Boo hoo!
563 def two_mat_print(mat_1, mat_2, f=None,alphabet=None,factor_1=1, factor_2=1,
564 format="%4d",bottomformat="%4s",topformat="%4s",
565 topindent=7*" ", bottomindent=1*" "):
566 f = f or sys.stdout
567 if not alphabet:
568 assert mat_1.ab_list == mat_2.ab_list
569 alphabet = mat_1.ab_list
570 len_alphabet = len(alphabet)
571 print_mat = {}
572 topline = topindent
573 bottomline = bottomindent
574 for i in alphabet:
575 bottomline += bottomformat % i
576 topline += topformat % alphabet[len_alphabet-alphabet.index(i)-1]
577 topline += '\n'
578 bottomline += '\n'
579 f.write(topline)
580 for i in alphabet:
581 for j in alphabet:
582 print_mat[i,j] = -999
583 diag_1 = {}; diag_2 = {}
584 for i in alphabet:
585 for j in alphabet[:alphabet.index(i)+1]:
586 if i == j:
587 diag_1[i] = mat_1[(i,i)]
588 diag_2[i] = mat_2[(alphabet[len_alphabet-alphabet.index(i)-1],
589 alphabet[len_alphabet-alphabet.index(i)-1])]
590 else:
591 if i > j:
592 key = (j,i)
593 else:
594 key = (i,j)
595 mat_2_key = [alphabet[len_alphabet-alphabet.index(key[0])-1],
596 alphabet[len_alphabet-alphabet.index(key[1])-1]]
597 # print mat_2_key
598 mat_2_key.sort(); mat_2_key = tuple(mat_2_key)
599 # print key ,"||", mat_2_key
600 print_mat[key] = mat_2[mat_2_key]
601 print_mat[(key[1],key[0])] = mat_1[key]
602 for i in alphabet:
603 outline = i
604 for j in alphabet:
605 if i == j:
606 if diag_1[i] == -999:
607 val_1 = ' ND'
608 else:
609 val_1 = format % (diag_1[i]*factor_1)
610 if diag_2[i] == -999:
611 val_2 = ' ND'
612 else:
613 val_2 = format % (diag_2[i]*factor_2)
614 cur_str = val_1 + " " + val_2
615 else:
616 if print_mat[(i,j)] == -999:
617 val = ' ND'
618 elif alphabet.index(i) > alphabet.index(j):
619 val = format % (print_mat[(i,j)]*factor_1)
620 else:
621 val = format % (print_mat[(i,j)]*factor_2)
622 cur_str = val
623 outline += cur_str
624 outline += bottomformat % (alphabet[len_alphabet-alphabet.index(i)-1] +
625 '\n')
626 f.write(outline)
627 f.write(bottomline)
628 """
629