Package Bio :: Package HMM :: Module MarkovModel
[hide private]
[frames] | no frames]

Source Code for Module Bio.HMM.MarkovModel

  1  """Deal with representations of Markov Models. 
  2  """ 
  3  # standard modules 
  4  import copy 
  5  import math 
  6  import random 
  7   
  8  # biopython 
  9  from Bio.Seq import MutableSeq 
 10   
11 -class MarkovModelBuilder:
12 """Interface to build up a Markov Model. 13 14 This class is designed to try to separate the task of specifying the 15 Markov Model from the actual model itself. This is in hopes of making 16 the actual Markov Model classes smaller. 17 18 So, this builder class should be used to create Markov models instead 19 of trying to initiate a Markov Model directly. 20 """ 21 # the default pseudo counts to use 22 DEFAULT_PSEUDO = 1 23
24 - def __init__(self, state_alphabet, emission_alphabet):
25 """Initialize a builder to create Markov Models. 26 27 Arguments: 28 29 o state_alphabet -- An alphabet containing all of the letters that 30 can appear in the states 31 32 o emission_alphabet -- An alphabet containing all of the letters for 33 states that can be emitted by the HMM. 34 """ 35 self._state_alphabet = state_alphabet 36 self._emission_alphabet = emission_alphabet 37 38 # the probabilities for transitions and emissions 39 # by default we have no transitions and all possible emissions 40 self.transition_prob = {} 41 self.emission_prob = self._all_blank(state_alphabet, 42 emission_alphabet) 43 44 # the default pseudocounts for transition and emission counting 45 self.transition_pseudo = {} 46 self.emission_pseudo = self._all_pseudo(state_alphabet, 47 emission_alphabet)
48
49 - def _all_blank(self, first_alphabet, second_alphabet):
50 """Return a dictionary with all counts set to zero. 51 52 This uses the letters in the first and second alphabet to create 53 a dictionary with keys of two tuples organized as 54 (letter of first alphabet, letter of second alphabet). The values 55 are all set to 0. 56 """ 57 all_blank = {} 58 for first_state in first_alphabet.letters: 59 for second_state in second_alphabet.letters: 60 all_blank[(first_state, second_state)] = 0 61 62 return all_blank
63
64 - def _all_pseudo(self, first_alphabet, second_alphabet):
65 """Return a dictionary with all counts set to a default value. 66 67 This takes the letters in first alphabet and second alphabet and 68 creates a dictionary with keys of two tuples organized as: 69 (letter of first alphabet, letter of second alphabet). The values 70 are all set to the value of the class attribute DEFAULT_PSEUDO. 71 """ 72 all_counts = {} 73 for first_state in first_alphabet.letters: 74 for second_state in second_alphabet.letters: 75 all_counts[(first_state, second_state)] = self.DEFAULT_PSEUDO 76 77 return all_counts
78
79 - def get_markov_model(self):
80 """Return the markov model corresponding with the current parameters. 81 82 Each markov model returned by a call to this function is unique 83 (ie. they don't influence each other). 84 """ 85 transition_prob = copy.deepcopy(self.transition_prob) 86 emission_prob = copy.deepcopy(self.emission_prob) 87 transition_pseudo = copy.deepcopy(self.transition_pseudo) 88 emission_pseudo = copy.deepcopy(self.emission_pseudo) 89 90 return HiddenMarkovModel(transition_prob, emission_prob, 91 transition_pseudo, emission_pseudo)
92
93 - def set_equal_probabilities(self):
94 """Reset all probabilities to be an average value. 95 96 This resets the values of all allowed transitions and all allowed 97 emissions to be equal to 1 divided by the number of possible elements. 98 99 This is useful if you just want to initialize a Markov Model to 100 starting values (ie. if you have no prior notions of what the 101 probabilities should be -- or if you are just feeling too lazy 102 to calculate them :-). 103 104 Warning 1 -- this will reset all currently set probabilities. 105 106 Warning 2 -- This just sets all probabilities for transitions and 107 emissions to total up to 1, so it doesn't ensure that the sum of 108 each set of transitions adds up to 1. 109 """ 110 # first set the transitions 111 new_trans_prob = float(1) / float(len(self.transition_prob.keys())) 112 for key in self.transition_prob.keys(): 113 self.transition_prob[key] = new_trans_prob 114 115 # now set the emissions 116 new_emission_prob = float(1) / float(len(self.emission_prob.keys())) 117 for key in self.emission_prob.keys(): 118 self.emission_prob[key] = new_emission_prob
119 120
121 - def set_random_probabilities(self):
122 """Set all probabilities to randomly generated numbers. 123 124 This will reset the value of all allowed transitions and emissions 125 to random values. 126 127 Warning 1 -- This will reset any currently set probabibilities. 128 129 Warning 2 -- This does not check to ensure that the sum of 130 all of the probabilities is less then 1. It just randomly assigns 131 a probability to each 132 """ 133 for key in self.transition_prob.keys(): 134 self.transition_prob[key] = random.random() 135 136 for key in self.emission_prob.keys(): 137 self.emission_prob[key] = random.random()
138 139 # --- functions to deal with the transitions in the sequence 140
141 - def allow_all_transitions(self):
142 """A convenience function to create transitions between all states. 143 144 By default all transitions within the alphabet are disallowed; this 145 is a way to change this to allow all possible transitions. 146 """ 147 # first get all probabilities and pseudo counts set 148 # to the default values 149 all_probs = self._all_blank(self._state_alphabet, 150 self._state_alphabet) 151 152 all_pseudo = self._all_pseudo(self._state_alphabet, 153 self._state_alphabet) 154 155 # now set any probabilities and pseudo counts that 156 # were previously set 157 for set_key in self.transition_prob.keys(): 158 all_probs[set_key] = self.transition_prob[set_key] 159 160 for set_key in self.transition_pseudo.keys(): 161 all_pseudo[set_key] = self.transition_pseudo[set_key] 162 163 # finally reinitialize the transition probs and pseudo counts 164 self.transition_prob = all_probs 165 self.transition_pseudo = all_pseudo
166
167 - def allow_transition(self, from_state, to_state, probability = None, 168 pseudocount = None):
169 """Set a transition as being possible between the two states. 170 171 probability and pseudocount are optional arguments 172 specifying the probabilities and pseudo counts for the transition. 173 If these are not supplied, then the values are set to the 174 default values. 175 176 Raises: 177 KeyError -- if the two states already have an allowed transition. 178 """ 179 # check the sanity of adding these states 180 for state in [from_state, to_state]: 181 assert state in self._state_alphabet, \ 182 "State %s was not found in the sequence alphabet" % state 183 184 # ensure that the states are not already set 185 if ((from_state, to_state) not in self.transition_prob.keys() and 186 (from_state, to_state) not in self.transition_pseudo.keys()): 187 # set the initial probability 188 if probability is None: 189 probability = 0 190 self.transition_prob[(from_state, to_state)] = probability 191 192 # set the initial pseudocounts 193 if pseudocount is None: 194 pseudcount = self.DEFAULT_PSEUDO 195 self.transition_pseudo[(from_state, to_state)] = pseudocount 196 else: 197 raise KeyError("Transtion from %s to %s is already allowed." 198 % (from_state, to_state))
199
200 - def destroy_transition(self, from_state, to_state):
201 """Restrict transitions between the two states. 202 203 Raises: 204 KeyError if the transition is not currently allowed. 205 """ 206 try: 207 del self.transition_prob[(from_state, to_state)] 208 del self.transition_pseudo[(from_state, to_state)] 209 except KeyError: 210 raise KeyError("Transition from %s to %s is already disallowed." 211 % (from_state, to_state))
212
213 - def set_transition_score(self, from_state, to_state, probability):
214 """Set the probability of a transition between two states. 215 216 Raises: 217 KeyError if the transition is not allowed. 218 """ 219 if self.transition_prob.has_key((from_state, to_state)): 220 self.transition_prob[(from_state, to_state)] = probability 221 else: 222 raise KeyError("Transition from %s to %s is not allowed." 223 % (from_state, to_state))
224
225 - def set_transition_pseudocount(self, from_state, to_state, count):
226 """Set the default pseudocount for a transition. 227 228 To avoid computational problems, it is helpful to be able to 229 set a 'default' pseudocount to start with for estimating 230 transition and emission probabilities (see p62 in Durbin et al 231 for more discussion on this. By default, all transitions have 232 a pseudocount of 1. 233 234 Raises: 235 KeyError if the transition is not allowed. 236 """ 237 if self.transition_pseudo.has_key((from_state, to_state)): 238 self.transition_pseudo[(from_state, to_state)] = count 239 else: 240 raise KeyError("Transition from %s to %s is not allowed." 241 % (from_state, to_state))
242 243 # --- functions to deal with emissions from the sequence 244
245 - def set_emission_score(self, seq_state, emission_state, probability):
246 """Set the probability of a emission from a particular state. 247 248 Raises: 249 KeyError if the emission from the given state is not allowed. 250 """ 251 if self.emission_prob.has_key((seq_state, emission_state)): 252 self.emission_prob[(seq_state, emission_state)] = probability 253 else: 254 raise KeyError("Emission of %s from %s is not allowed." 255 % (emission_state, seq_state))
256
257 - def set_emission_pseudocount(self, seq_state, emission_state, count):
258 """Set the default pseudocount for an emission. 259 260 To avoid computational problems, it is helpful to be able to 261 set a 'default' pseudocount to start with for estimating 262 transition and emission probabilities (see p62 in Durbin et al 263 for more discussion on this. By default, all emissions have 264 a pseudocount of 1. 265 266 Raises: 267 KeyError if the emission from the given state is not allowed. 268 """ 269 if self.emission_pseudo.has_key((seq_state, emission_state)): 270 self.emission_pseudo[(seq_state, emission_state)] = count 271 else: 272 raise KeyError("Emission of %s from %s is not allowed." 273 % (emission_state, seq_state))
274
275 -class HiddenMarkovModel:
276 """Represent a hidden markov model that can be used for state estimation. 277 """
278 - def __init__(self, transition_prob, emission_prob, transition_pseudo, 279 emission_pseudo):
280 """Initialize a Markov Model. 281 282 Note: You should use the MarkovModelBuilder class instead of 283 initiating this class directly. 284 285 Arguments: 286 287 o transition_prob -- A dictionary of transition probabilities for all 288 possible transitions in the sequence. 289 290 o emission_prob -- A dictionary of emissions probabilities for all 291 possible emissions from the sequence states. 292 293 o transition_pseudo -- Pseudo-counts to be used for the transitions, 294 when counting for purposes of estimating transition probabilities. 295 296 o emission_pseduo -- Pseudo-counts fo tbe used for the emissions, 297 when counting for purposes of estimating emission probabilities. 298 """ 299 self._transition_pseudo = transition_pseudo 300 self._emission_pseudo = emission_pseudo 301 302 self.transition_prob = transition_prob 303 self.emission_prob = emission_prob 304 305 # a dictionary of the possible transitions from one letter to the next 306 # the keys are the letter, and the values are lists of letters that 307 # can be transitioned to 308 self._transitions_from = \ 309 self._calculate_from_transitions(self.transition_prob)
310
311 - def _calculate_from_transitions(self, trans_probs):
312 """Calculate which 'from transitions' are allowed for each letter. 313 314 This looks through all of the trans_probs, and uses this dictionary 315 to determine allowed transitions. It converts this information into 316 a dictionary, whose keys are the transition letters and whose 317 values are a list of allowed letters to transition to. 318 """ 319 from_transitions = {} 320 321 # loop over all of the different transitions 322 for trans_key in trans_probs.keys(): 323 # if the letter to 'transition from' already exists, add the 324 # new letter which can be 'transitioned to' to the list 325 try: 326 from_transitions[trans_key[0]].append(trans_key[1]) 327 # otherwise create the list and add the letter 328 except KeyError: 329 from_transitions[trans_key[0]] = [] 330 from_transitions[trans_key[0]].append(trans_key[1]) 331 332 return from_transitions
333
334 - def get_blank_transitions(self):
335 """Get the default transitions for the model. 336 337 Returns a dictionary of all of the default transitions between any 338 two letters in the sequence alphabet. The dictionary is structured 339 with keys as (letter1, letter2) and values as the starting number 340 of transitions. 341 """ 342 return self._transition_pseudo
343
344 - def get_blank_emissions(self):
345 """Get the starting default emmissions for each sequence. 346 347 This returns a dictionary of the default emmissions for each 348 letter. The dictionary is structured with keys as 349 (seq_letter, emmission_letter) and values as the starting number 350 of emmissions. 351 """ 352 return self._emission_pseudo
353
354 - def transitions_from(self, state_letter):
355 """Get all transitions which can happen from the given state. 356 357 This returns all letters which the given state_letter is allowed 358 to transition to. An empty list is returned if no letters are possible. 359 """ 360 try: 361 return self._transitions_from[state_letter] 362 except KeyError: 363 return []
364
365 - def viterbi(self, sequence, state_alphabet):
366 """Calculate the most probable state path using the Viterbi algorithm. 367 368 This implements the Viterbi algorithm (see pgs 55-57 in Durbin et 369 al for a full explanation -- this is where I took my implementation 370 ideas from), to allow decoding of the state path, given a sequence 371 of emissions. 372 373 Arguments: 374 375 o sequence -- A Seq object with the emission sequence that we 376 want to decode. 377 378 o state_alphabet -- The alphabet of the possible state sequences 379 that can be generated. 380 """ 381 # calculate logarithms of the transition and emission probs 382 log_trans = self._log_transform(self.transition_prob) 383 log_emission = self._log_transform(self.emission_prob) 384 385 viterbi_probs = {} 386 pred_state_seq = {} 387 state_letters = state_alphabet.letters 388 # --- initialization 389 # 390 # NOTE: My index numbers are one less than what is given in Durbin 391 # et al, since we are indexing the sequence going from 0 to 392 # (Length - 1) not 1 to Length, like in Durbin et al. 393 # 394 # v_{0}(0) = 1 395 viterbi_probs[(state_letters[0], -1)] = 1 396 # v_{k}(0) = 0 for k > 0 397 for state_letter in state_letters[1:]: 398 viterbi_probs[(state_letter, -1)] = 0 399 400 # --- recursion 401 # loop over the training squence (i = 1 .. L) 402 for i in range(0, len(sequence)): 403 # now loop over all of the letters in the state path 404 for main_state in state_letters: 405 # e_{l}(x_{i}) 406 emission_part = log_emission[(main_state, sequence[i])] 407 408 # loop over all possible states 409 possible_state_probs = {} 410 for cur_state in self.transitions_from(main_state): 411 # a_{kl} 412 trans_part = log_trans[(cur_state, main_state)] 413 414 # v_{k}(i - 1) 415 viterbi_part = viterbi_probs[(cur_state, i - 1)] 416 cur_prob = viterbi_part + trans_part 417 418 possible_state_probs[cur_state] = cur_prob 419 420 # finally calculate the viterbi probability using the max 421 max_prob = max(possible_state_probs.values()) 422 viterbi_probs[(main_state, i)] = (emission_part + max_prob) 423 424 # now get the most likely state 425 for state in possible_state_probs.keys(): 426 if possible_state_probs[state] == max_prob: 427 pred_state_seq[(i - 1, main_state)] = state 428 break 429 430 # --- termination 431 # calculate the probability of the state path 432 # loop over all letters 433 all_probs = {} 434 for state in state_letters: 435 # v_{k}(L) 436 viterbi_part = viterbi_probs[(state, len(sequence) - 1)] 437 # a_{k0} 438 transition_part = log_trans[(state, state_letters[0])] 439 440 all_probs[state] = viterbi_part * transition_part 441 442 state_path_prob = max(all_probs.values()) 443 444 # find the last pointer we need to trace back from 445 last_state = '' 446 for state in all_probs.keys(): 447 if all_probs[state] == state_path_prob: 448 last_state = state 449 450 assert last_state != '', "Didn't find the last state to trace from!" 451 452 # --- traceback 453 traceback_seq = MutableSeq('', state_alphabet) 454 455 loop_seq = range(0, len(sequence)) 456 loop_seq.reverse() 457 458 cur_state = last_state 459 for i in loop_seq: 460 traceback_seq.append(cur_state) 461 462 cur_state = pred_state_seq[(i - 1, cur_state)] 463 464 # put the traceback sequence in the proper orientation 465 traceback_seq.reverse() 466 467 return traceback_seq.toseq(), state_path_prob
468
469 - def _log_transform(self, probability):
470 """Return log transform of the given probability dictionary. 471 472 When calculating the Viterbi equation, we need to deal with things 473 as sums of logs instead of products of probabilities, so that we 474 don't get underflow errors.. This copies the given probability 475 dictionary and returns the same dictionary with everything 476 transformed with a log. 477 """ 478 log_prob = copy.copy(probability) 479 480 for key in log_prob.keys(): 481 log_prob[key] = math.log(log_prob[key]) 482 483 return log_prob
484