Source Code for Module Bio.MarkovModel

  1  """ 
  2  This is an implementation of a state-emitting MarkovModel.  I am using 
  3  terminology similar to Manning and Schutze. 
  4   
  5   
  6   
  7  Functions: 
  8  train_bw        Train a markov model using the Baum-Welch algorithm. 
  9  train_visible   Train a visible markov model using MLE. 
 10  find_states     Find the a state sequence that explains some observations. 
 11   
 12  load            Load a MarkovModel. 
 13  save            Save a MarkovModel. 
 14   
 15  Classes: 
 16  MarkovModel     Holds the description of a markov model 
 17  """ 
 18   
 19  import numpy 
 20   
 21   
def itemindex(values):
    # Map each value to the index of its first occurrence in values.
    d = {}
    entries = enumerate(values[::-1])
    n = len(values) - 1
    for index, key in entries:
        d[key] = n - index
    return d
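# A small illustrative example (the values below are arbitrary):
# itemindex(["A", "C", "G", "T", "A"]) maps "A" to 0, "C" to 1, "G" to 2,
# and "T" to 3; a repeated value keeps the index of its first occurrence.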

numpy.random.seed()

VERY_SMALL_NUMBER = 1E-300
LOG0 = numpy.log(VERY_SMALL_NUMBER)

class MarkovModel:
    def __init__(self, states, alphabet,
                 p_initial=None, p_transition=None, p_emission=None):
        self.states = states
        self.alphabet = alphabet
        self.p_initial = p_initial
        self.p_transition = p_transition
        self.p_emission = p_emission

    def __str__(self):
        import StringIO
        handle = StringIO.StringIO()
        save(self, handle)
        handle.seek(0)
        return handle.read()

def _readline_and_check_start(handle, start):
    line = handle.readline()
    if not line.startswith(start):
        raise ValueError("I expected %r but got %r" % (start, line))
    return line

def load(handle):
    """load(handle) -> MarkovModel()"""
    # Load the states.
    line = _readline_and_check_start(handle, "STATES:")
    states = line.split()[1:]

    # Load the alphabet.
    line = _readline_and_check_start(handle, "ALPHABET:")
    alphabet = line.split()[1:]

    mm = MarkovModel(states, alphabet)
    N, M = len(states), len(alphabet)

    # Load the initial probabilities.
    mm.p_initial = numpy.zeros(N)
    line = _readline_and_check_start(handle, "INITIAL:")
    for i in range(len(states)):
        line = _readline_and_check_start(handle, " %s:" % states[i])
        mm.p_initial[i] = float(line.split()[-1])

    # Load the transition probabilities.
    mm.p_transition = numpy.zeros((N, N))
    line = _readline_and_check_start(handle, "TRANSITION:")
    for i in range(len(states)):
        line = _readline_and_check_start(handle, " %s:" % states[i])
        mm.p_transition[i, :] = map(float, line.split()[1:])

    # Load the emission probabilities.
    mm.p_emission = numpy.zeros((N, M))
    line = _readline_and_check_start(handle, "EMISSION:")
    for i in range(len(states)):
        line = _readline_and_check_start(handle, " %s:" % states[i])
        mm.p_emission[i, :] = map(float, line.split()[1:])

    return mm

def save(mm, handle):
    """save(mm, handle)"""
    # This will fail if there are spaces in the states or alphabet.
    w = handle.write
    w("STATES: %s\n" % ' '.join(mm.states))
    w("ALPHABET: %s\n" % ' '.join(mm.alphabet))
    w("INITIAL:\n")
    for i in range(len(mm.p_initial)):
        w(" %s: %g\n" % (mm.states[i], mm.p_initial[i]))
    w("TRANSITION:\n")
    for i in range(len(mm.p_transition)):
        x = map(str, mm.p_transition[i])
        w(" %s: %s\n" % (mm.states[i], ' '.join(x)))
    w("EMISSION:\n")
    for i in range(len(mm.p_emission)):
        x = map(str, mm.p_emission[i])
        w(" %s: %s\n" % (mm.states[i], ' '.join(x)))

# XXX allow them to specify starting points
def train_bw(states, alphabet, training_data,
             pseudo_initial=None, pseudo_transition=None, pseudo_emission=None,
             update_fn=None,
             ):
    """train_bw(states, alphabet, training_data[, pseudo_initial]
    [, pseudo_transition][, pseudo_emission][, update_fn]) -> MarkovModel

    Train a MarkovModel using the Baum-Welch algorithm.  states is a list
    of strings that describe the names of each state.  alphabet is a
    list of objects that indicate the allowed outputs.  training_data
    is a list of observations.  Each observation is a list of objects
    from the alphabet.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    different matrices.  They should be matrices of the appropriate
    size that contain numbers to add to each parameter matrix, before
    normalization.

    update_fn is an optional callback that takes parameters
    (iteration, log_likelihood).  It is called once per iteration.

    """
    N, M = len(states), len(alphabet)
    if not training_data:
        raise ValueError("No training data given.")
    if pseudo_initial is not None:
        pseudo_initial = numpy.asarray(pseudo_initial)
        if pseudo_initial.shape != (N,):
            raise ValueError("pseudo_initial not shape len(states)")
    if pseudo_transition is not None:
        pseudo_transition = numpy.asarray(pseudo_transition)
        if pseudo_transition.shape != (N, N):
            raise ValueError("pseudo_transition not shape "
                             "len(states) X len(states)")
    if pseudo_emission is not None:
        pseudo_emission = numpy.asarray(pseudo_emission)
        if pseudo_emission.shape != (N, M):
            raise ValueError("pseudo_emission not shape "
                             "len(states) X len(alphabet)")

    # Training data is given as a list of members of the alphabet.
    # Replace those with indexes into the alphabet list for easier
    # computation.
    training_outputs = []
    indexes = itemindex(alphabet)
    for outputs in training_data:
        training_outputs.append([indexes[x] for x in outputs])

    # Do some sanity checking on the outputs.
    lengths = map(len, training_outputs)
    if min(lengths) == 0:
        raise ValueError("I got training data with outputs of length 0")

    # Do the training with Baum-Welch.
    x = _baum_welch(N, M, training_outputs,
                    pseudo_initial=pseudo_initial,
                    pseudo_transition=pseudo_transition,
                    pseudo_emission=pseudo_emission,
                    update_fn=update_fn)
    p_initial, p_transition, p_emission = x
    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)

MAX_ITERATIONS = 1000

def _baum_welch(N, M, training_outputs,
                p_initial=None, p_transition=None, p_emission=None,
                pseudo_initial=None, pseudo_transition=None,
                pseudo_emission=None, update_fn=None):
    # Returns (p_initial, p_transition, p_emission)
    if p_initial is None:
        p_initial = _random_norm(N)
    else:
        p_initial = _copy_and_check(p_initial, (N,))

    if p_transition is None:
        p_transition = _random_norm((N, N))
    else:
        p_transition = _copy_and_check(p_transition, (N, N))
    if p_emission is None:
        p_emission = _random_norm((N, M))
    else:
        p_emission = _copy_and_check(p_emission, (N, M))

    # Do all the calculations in log space to avoid underflows.
    lp_initial, lp_transition, lp_emission = map(
        numpy.log, (p_initial, p_transition, p_emission))
    if pseudo_initial is not None:
        lpseudo_initial = numpy.log(pseudo_initial)
    else:
        lpseudo_initial = None
    if pseudo_transition is not None:
        lpseudo_transition = numpy.log(pseudo_transition)
    else:
        lpseudo_transition = None
    if pseudo_emission is not None:
        lpseudo_emission = numpy.log(pseudo_emission)
    else:
        lpseudo_emission = None

    # Iterate through each sequence of outputs, updating the parameters
    # of the HMM.  Stop when the log likelihood of the sequences
    # stops varying.
    prev_llik = None
    for i in range(MAX_ITERATIONS):
        llik = LOG0
        for outputs in training_outputs:
            x = _baum_welch_one(
                N, M, outputs,
                lp_initial, lp_transition, lp_emission,
                lpseudo_initial, lpseudo_transition, lpseudo_emission)
            llik += x
        if update_fn is not None:
            update_fn(i, llik)
        if prev_llik is not None and numpy.fabs(prev_llik - llik) < 0.1:
            break
        prev_llik = llik
    else:
        raise RuntimeError("HMM did not converge in %d iterations"
                           % MAX_ITERATIONS)

    # Return everything back in normal space.
    return map(numpy.exp, (lp_initial, lp_transition, lp_emission))

def _baum_welch_one(N, M, outputs,
                    lp_initial, lp_transition, lp_emission,
                    lpseudo_initial, lpseudo_transition, lpseudo_emission):
    # Do one iteration of Baum-Welch based on a sequence of output.
    # NOTE: This will change the values of lp_initial, lp_transition,
    # and lp_emission in place.
    T = len(outputs)
    fmat = _forward(N, T, lp_initial, lp_transition, lp_emission, outputs)
    bmat = _backward(N, T, lp_transition, lp_emission, outputs)

    # Calculate the probability of traversing each arc for any given
    # transition.
    lp_arc = numpy.zeros((N, N, T))
    for t in range(T):
        k = outputs[t]
        lp_traverse = numpy.zeros((N, N))  # P of going over one arc.
        for i in range(N):
            for j in range(N):
                # P(getting to this arc)
                # P(making this transition)
                # P(emitting this character)
                # P(going to the end)
                lp = fmat[i][t] + \
                     lp_transition[i][j] + \
                     lp_emission[i][k] + \
                     bmat[j][t+1]
                lp_traverse[i][j] = lp
        # Normalize the probability for this time step.
        lp_arc[:, :, t] = lp_traverse - _logsum(lp_traverse)

    # Sum of all the transitions out of state i at time t.
    lp_arcout_t = numpy.zeros((N, T))
    for t in range(T):
        for i in range(N):
            lp_arcout_t[i][t] = _logsum(lp_arc[i, :, t])

    # Sum of all the transitions out of state i.
    lp_arcout = numpy.zeros(N)
    for i in range(N):
        lp_arcout[i] = _logsum(lp_arcout_t[i, :])

    # UPDATE P_INITIAL.
    new_initial = lp_arcout_t[:, 0]
    if lpseudo_initial is not None:
        new_initial = _logvecadd(new_initial, lpseudo_initial)
        new_initial = new_initial - _logsum(new_initial)
    # Assign in place so the caller's lp_initial is actually updated.
    lp_initial[:] = new_initial

    # UPDATE P_TRANSITION.  p_transition[i][j] is the sum of all the
    # transitions from i to j, normalized by the sum of the
    # transitions out of i.
    for i in range(N):
        for j in range(N):
            lp_transition[i][j] = _logsum(lp_arc[i, j, :]) - lp_arcout[i]
        if lpseudo_transition is not None:
            lp_transition[i] = _logvecadd(lp_transition[i], lpseudo_transition)
            lp_transition[i] = lp_transition[i] - _logsum(lp_transition[i])

    # UPDATE P_EMISSION.  lp_emission[i][k] is the sum of all the
    # transitions out of i when k is observed, divided by the sum of
    # the transitions out of i.
    for i in range(N):
        ksum = numpy.zeros(M) + LOG0  # ksum[k] is the sum of all i with k.
        for t in range(T):
            k = outputs[t]
            for j in range(N):
                ksum[k] = _logadd(ksum[k], lp_arc[i, j, t])
        ksum = ksum - _logsum(ksum)  # Normalize.
        if lpseudo_emission is not None:
            ksum = _logvecadd(ksum, lpseudo_emission[i])
            ksum = ksum - _logsum(ksum)  # Renormalize.
        lp_emission[i, :] = ksum

    # Calculate the log likelihood of the output based on the forward
    # matrix.  Since the parameters of the HMM have changed, the log
    # likelihoods are going to be a step behind, and we might be doing
    # one extra iteration of training.  The alternative is to rerun
    # the _forward algorithm and calculate from the clean one, but
    # that may be more expensive than overshooting the training by one
    # step.
    return _logsum(fmat[:, T])

def _forward(N, T, lp_initial, lp_transition, lp_emission, outputs):
    # Implement the forward algorithm.  This actually calculates an
    # Nx(T+1) matrix, where the last column is the total probability
    # of the output.

    matrix = numpy.zeros((N, T+1))

    # Initialize the first column to be the initial values.
    matrix[:, 0] = lp_initial
    for t in range(1, T+1):
        k = outputs[t-1]
        for j in range(N):
            # The probability of the state is the sum of the
            # transitions from all the states from time t-1.
            lprob = LOG0
            for i in range(N):
                lp = matrix[i][t-1] + \
                     lp_transition[i][j] + \
                     lp_emission[i][k]
                lprob = _logadd(lprob, lp)
            matrix[j][t] = lprob
    return matrix

def _backward(N, T, lp_transition, lp_emission, outputs):
    # Implement the backward algorithm.  The last column of the
    # Nx(T+1) matrix stays at zero (log 1).
    matrix = numpy.zeros((N, T+1))
    for t in range(T-1, -1, -1):
        k = outputs[t]
        for i in range(N):
            # The probability of the state is the sum of the
            # transitions from all the states from time t+1.
            lprob = LOG0
            for j in range(N):
                lp = matrix[j][t+1] + \
                     lp_transition[i][j] + \
                     lp_emission[i][k]
                lprob = _logadd(lprob, lp)
            matrix[i][t] = lprob
    return matrix

def train_visible(states, alphabet, training_data,
                  pseudo_initial=None, pseudo_transition=None,
                  pseudo_emission=None):
    """train_visible(states, alphabet, training_data[, pseudo_initial]
    [, pseudo_transition][, pseudo_emission]) -> MarkovModel

    Train a visible MarkovModel using maximum likelihood estimates
    for each of the parameters.  states is a list of strings that
    describe the names of each state.  alphabet is a list of objects
    that indicate the allowed outputs.  training_data is a list of
    (outputs, observed states) where outputs is a list of the emissions
    from the alphabet, and observed states is a list of states from
    states.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    different matrices.  They should be matrices of the appropriate
    size that contain numbers to add to each parameter matrix.

    """
    N, M = len(states), len(alphabet)
    if pseudo_initial is not None:
        pseudo_initial = numpy.asarray(pseudo_initial)
        if pseudo_initial.shape != (N,):
            raise ValueError("pseudo_initial not shape len(states)")
    if pseudo_transition is not None:
        pseudo_transition = numpy.asarray(pseudo_transition)
        if pseudo_transition.shape != (N, N):
            raise ValueError("pseudo_transition not shape "
                             "len(states) X len(states)")
    if pseudo_emission is not None:
        pseudo_emission = numpy.asarray(pseudo_emission)
        if pseudo_emission.shape != (N, M):
            raise ValueError("pseudo_emission not shape "
                             "len(states) X len(alphabet)")

    # Training data is given as a list of members of the alphabet.
    # Replace those with indexes into the alphabet list for easier
    # computation.
    training_states, training_outputs = [], []
    states_indexes = itemindex(states)
    outputs_indexes = itemindex(alphabet)
    for toutputs, tstates in training_data:
        if len(tstates) != len(toutputs):
            raise ValueError("states and outputs not aligned")
        training_states.append([states_indexes[x] for x in tstates])
        training_outputs.append([outputs_indexes[x] for x in toutputs])

    x = _mle(N, M, training_outputs, training_states,
             pseudo_initial, pseudo_transition, pseudo_emission)
    p_initial, p_transition, p_emission = x

    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)

def _mle(N, M, training_outputs, training_states, pseudo_initial,
         pseudo_transition, pseudo_emission):
    # p_initial is the probability that a sequence of states starts
    # off with a particular one.
    p_initial = numpy.zeros(N)
    if pseudo_initial is not None:
        p_initial = p_initial + pseudo_initial
    for states in training_states:
        p_initial[states[0]] += 1
    p_initial = _normalize(p_initial)

    # p_transition is the probability that a state leads to the next
    # one.  C(i,j)/C(i) where i and j are states.
    p_transition = numpy.zeros((N, N))
    if pseudo_transition is not None:
        p_transition = p_transition + pseudo_transition
    for states in training_states:
        for n in range(len(states)-1):
            i, j = states[n], states[n+1]
            p_transition[i, j] += 1
    for i in range(len(p_transition)):
        p_transition[i, :] = p_transition[i, :] / sum(p_transition[i, :])

    # p_emission is the probability of an output given a state.
    # C(s,o)/C(s) where o is an output and s is a state.  Start from
    # ones so that unseen emissions keep a small nonzero probability.
    p_emission = numpy.ones((N, M))
    if pseudo_emission is not None:
        p_emission = p_emission + pseudo_emission
    for outputs, states in zip(training_outputs, training_states):
        for o, s in zip(outputs, states):
            p_emission[s, o] += 1
    for i in range(len(p_emission)):
        p_emission[i, :] = p_emission[i, :] / sum(p_emission[i, :])

    return p_initial, p_transition, p_emission

def _argmaxes(vector, allowance=None):
    # Return the indexes of the maximum entries; the current
    # implementation returns only the single best index.
    return [numpy.argmax(vector)]

def find_states(markov_model, output):
    """find_states(markov_model, output) -> list of (states, score)"""
    mm = markov_model
    N = len(mm.states)

    # _viterbi does calculations in log space.  Add a tiny bit to the
    # matrices so that the logs will not break.
    x = mm.p_initial + VERY_SMALL_NUMBER
    y = mm.p_transition + VERY_SMALL_NUMBER
    z = mm.p_emission + VERY_SMALL_NUMBER
    lp_initial, lp_transition, lp_emission = map(numpy.log, (x, y, z))
    # Change output into a list of indexes into the alphabet.
    indexes = itemindex(mm.alphabet)
    output = [indexes[x] for x in output]

    # Run the Viterbi algorithm.
    results = _viterbi(N, lp_initial, lp_transition, lp_emission, output)

    for i in range(len(results)):
        states, score = results[i]
        results[i] = [mm.states[x] for x in states], numpy.exp(score)
    return results

def _viterbi(N, lp_initial, lp_transition, lp_emission, output):
    # The Viterbi algorithm finds the most likely set of states for a
    # given output.  Returns a list of states.

    T = len(output)
    # Store the backtrace in an NxT matrix.
    backtrace = []  # list of indexes of states in previous timestep.
    for i in range(N):
        backtrace.append([None] * T)

    # Store the best scores.
    scores = numpy.zeros((N, T))
    scores[:, 0] = lp_initial + lp_emission[:, output[0]]
    for t in range(1, T):
        k = output[t]
        for j in range(N):
            # Find the most likely place it came from.
            i_scores = scores[:, t-1] + \
                       lp_transition[:, j] + \
                       lp_emission[j, k]
            indexes = _argmaxes(i_scores)
            scores[j, t] = i_scores[indexes[0]]
            backtrace[j][t] = indexes

    # Do the backtrace.  First, find a good place to start.  Then,
    # we'll follow the backtrace matrix to find the list of states.
    # In the event of ties, there may be multiple paths back through
    # the matrix, which implies a recursive solution.  We'll simulate
    # it by keeping our own stack.
    in_process = []  # list of (t, states, score)
    results = []     # return values; list of (states, score)
    indexes = _argmaxes(scores[:, T-1])  # pick the first place
    for i in indexes:
        in_process.append((T-1, [i], scores[i][T-1]))
    while in_process:
        t, states, score = in_process.pop()
        if t == 0:
            results.append((states, score))
        else:
            indexes = backtrace[states[0]][t]
            for i in indexes:
                in_process.append((t-1, [i]+states, score))
    return results

def _normalize(matrix):
    # Make sure the numbers add up to 1.0.
    if len(matrix.shape) == 1:
        matrix = matrix / float(sum(matrix))
    elif len(matrix.shape) == 2:
        # Normalize by rows.
        for i in range(len(matrix)):
            matrix[i, :] = matrix[i, :] / sum(matrix[i, :])
    else:
        raise ValueError("I cannot handle matrices of that shape")
    return matrix

def _uniform_norm(shape):
    matrix = numpy.ones(shape)
    return _normalize(matrix)

def _random_norm(shape):
    matrix = numpy.random.random(shape)
    return _normalize(matrix)

def _copy_and_check(matrix, desired_shape):
    # Copy the matrix.
    matrix = numpy.array(matrix, copy=1)
    # Check the dimensions.
    if matrix.shape != desired_shape:
        raise ValueError("Incorrect dimension")
    # Make sure it's normalized.
    if len(matrix.shape) == 1:
        if numpy.fabs(sum(matrix)-1.0) > 0.01:
            raise ValueError("matrix not normalized to 1.0")
    elif len(matrix.shape) == 2:
        for i in range(len(matrix)):
            if numpy.fabs(sum(matrix[i])-1.0) > 0.01:
                raise ValueError("matrix %d not normalized to 1.0" % i)
    else:
        raise ValueError("I don't handle matrices > 2 dimensions")
    return matrix

def _logadd(logx, logy):
    # Return log(exp(logx) + exp(logy)), computed in log space to avoid
    # underflow and overflow.
    if logy - logx > 100:
        return logy
    elif logx - logy > 100:
        return logx
    minxy = min(logx, logy)
    return minxy + numpy.log(numpy.exp(logx-minxy) + numpy.exp(logy-minxy))
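# For example, _logadd(numpy.log(0.5), numpy.log(0.25)) equals
# numpy.log(0.75) up to floating-point rounding.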

def _logsum(matrix):
    # Return the log of the sum of the exponentials of all the entries.
    if len(matrix.shape) > 1:
        vec = numpy.reshape(matrix, (numpy.product(matrix.shape),))
    else:
        vec = matrix
    sum = LOG0
    for num in vec:
        sum = _logadd(sum, num)
    return sum

def _logvecadd(logvec1, logvec2):
    assert len(logvec1) == len(logvec2), "vectors aren't the same length"
    sumvec = numpy.zeros(len(logvec1))
    for i in range(len(logvec1)):
        sumvec[i] = _logadd(logvec1[i], logvec2[i])
    return sumvec

def _exp_logsum(numbers):
    sum = _logsum(numbers)
    return numpy.exp(sum)


# If a compiled C implementation is available, use it to replace the
# Python functions defined above.
try:
    import cMarkovModel
except ImportError, x:
    pass
else:
    import sys
    this_module = sys.modules[__name__]
    for name in cMarkovModel.__dict__.keys():
        if not name.startswith("__"):
            this_module.__dict__[name] = cMarkovModel.__dict__[name]