Package Bio :: Module NaiveBayes
[hide private]
[frames] | [no frames]

Source Code for Module Bio.NaiveBayes

  1  # Copyright 2000 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """This provides code for a general Naive Bayes learner. 
  7   
  8  Naive Bayes is a supervised classification algorithm that uses Bayes 
  9  rule to compute the fit between a new observation and some previously 
 10  observed data.  The observations are discrete feature vectors, with 
 11  the Bayes assumption that the features are independent.  Although this 
 12  is hardly ever true, the classifier works well enough in practice. 
 13   
 14  Glossary: 
 15  observation    A feature vector of discrete data. 
 16  class          A possible classification for an observation. 
 17   
 18   
 19  Classes: 
 20  NaiveBayes     Holds information for a naive Bayes classifier. 
 21   
 22  Functions: 
 23  train          Train a new naive Bayes classifier. 
 24  calculate      Calculate the probabilities of each class, given an observation. 
 25  classify       Classify an observation into a class. 
 26   
 27  """ 
 28   
 29  import numpy 
 30   
 31  #TODO - Remove this work around once we drop python 2.3 support 
 32  try: 
 33      set 
 34  except NameError: 
 35      from sets import Set as set 
 36   
37 -def _contents(items):
38 term = 1.0/len(items) 39 counts = {} 40 for item in items: 41 counts[item] = counts.get(item,0) + term 42 return counts
43
class NaiveBayes:
    """Hold the parameters of a (trained) naive Bayes classifier.

    Members:
    classes         List of the possible classes of data.
    p_conditional   CLASS x DIM array of dicts of value -> P(value|class,dim)
    p_prior         List of the prior probabilities for every class.
    dimensionality  Dimensionality of the data.

    """
    def __init__(self):
        # A freshly constructed model is empty; train() populates it.
        self.classes = []
        self.p_prior = []
        self.p_conditional = None
        self.dimensionality = None
59
def calculate(nb, observation, scale=0):
    """calculate(nb, observation[, scale]) -> probability dict

    Compute log P(class|observation) for every class known to the trained
    NaiveBayes classifier nb.  observation is a list representing the
    observed data.  scale is whether the probability should be scaled by
    P(observation); by default, no scaling is done.  Returns a dictionary
    mapping each class to its log probability.

    Raises ValueError if the observation's dimensionality does not match
    the classifier's.
    """
    # Bayes rule in log space:
    #   lP(class|observation) = lP(observation|class) + lP(class) - lP(observation)

    # Reject observations of the wrong dimensionality up front.
    if len(observation) != nb.dimensionality:
        raise ValueError("observation in %d dimension, but classifier in %d"
                         % (len(observation), nb.dimensionality))

    n = len(nb.classes)
    # log P(observation|class): by the naive independence assumption this
    # is the sum over dimensions of log P(observation_j|class).
    lp_observation_class = numpy.zeros(n)
    for i in range(n):
        probs = [nb.p_conditional[i][j].get(value, 0)
                 for j, value in enumerate(observation)]
        # Clip so unseen values (probability 0) do not produce -inf.
        lprobs = numpy.log(numpy.clip(probs, 1.e-300, 1.e+300))
        lp_observation_class[i] = sum(lprobs)

    # log P(class), from the trained priors.
    lp_prior = numpy.log(nb.p_prior)

    # log P(observation), computed only when scaling was requested:
    #   log SUM_i P(observation|class_i) P(class_i)
    lp_observation = 0.0
    if scale:
        obs = numpy.exp(numpy.clip(lp_prior + lp_observation_class, -700, +700))
        lp_observation = numpy.log(sum(obs))

    # Assemble the class -> log P(class|observation) dictionary.
    return dict((klass, lp_observation_class[i] + lp_prior[i] - lp_observation)
                for i, klass in enumerate(nb.classes))
108
def classify(nb, observation):
    """classify(nb, observation) -> class

    Return the most probable class for the observation, as judged by the
    trained NaiveBayes classifier nb.
    """
    # Unscaled log probabilities suffice for picking the winner.
    probs = calculate(nb, observation, scale=0)
    if not nb.classes:
        return None
    # max() keeps the first class attaining the highest probability, so
    # ties go to the class listed earliest in nb.classes.
    return max(nb.classes, key=lambda klass: probs[klass])
122
def train(training_set, results, priors=None, typecode=None):
    """train(training_set, results[, priors]) -> NaiveBayes

    Train a naive bayes classifier on a training set.  training_set is a
    list of observations.  results is a list of the class assignments
    for each observation.  Thus, training_set and results must be the same
    length.  priors is an optional dictionary specifying the prior
    probabilities for each type of result.  If not specified, the priors
    will be estimated from the training results.

    Raises ValueError if the training set is empty, if training_set and
    results have different lengths, or if the observations differ in
    dimensionality.
    """
    if not len(training_set):
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # If no typecode is specified, try to pick a reasonable one. If
    # training_set is a Numeric array, then use that typecode.
    # Otherwise, choose a reasonable default.
    # XXX NOT IMPLEMENTED

    # Check to make sure each vector in the training set has the same
    # dimensionality.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError("observations have different dimensionality")

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Get a list of all the classes, and
    # estimate the prior probabilities for the classes.
    if priors is not None:
        percs = priors
        nb.classes = list(set(results))
    else:
        class_freq = _contents(results)
        # BUG FIX: dict.keys() is a non-sortable view on Python 3; take
        # an explicit list so the sort below works on both Python 2 and 3.
        nb.classes = list(class_freq)
        percs = class_freq
    nb.classes.sort()   # keep it tidy

    nb.p_prior = numpy.zeros(len(nb.classes))
    for i in range(len(nb.classes)):
        nb.p_prior[i] = percs[nb.classes[i]]

    # Collect all the observations in class.  For each class, make a
    # matrix of training instances versus dimensions.  I might be able
    # to optimize this with Numeric, if the training_set parameter
    # were guaranteed to be a matrix.  However, this may not be the
    # case, because the client may be hacking up a sparse matrix or
    # something.
    c2i = {}            # class to index of class
    for index, key in enumerate(nb.classes):
        c2i[key] = index
    observations = [[] for c in nb.classes]  # separate observations by class
    for i in range(len(results)):
        klass, obs = results[i], training_set[i]
        observations[c2i[klass]].append(obs)
    # Now make the observations Numeric matrices.
    for i in range(len(observations)):
        # XXX typecode must be specified!
        observations[i] = numpy.asarray(observations[i], typecode)

    # Calculate P(value|class,dim) for every class.
    # This is a good loop to optimize.
    nb.p_conditional = []
    for i in range(len(nb.classes)):
        class_observations = observations[i]  # observations for this class
        nb.p_conditional.append([None] * nb.dimensionality)
        for j in range(nb.dimensionality):
            # Collect all the values in this dimension.
            values = class_observations[:, j]

            # Add pseudocounts here.  This needs to be parameterized.
            #values = list(values) + range(len(nb.classes))  # XXX add 1

            # Estimate P(value|class,dim)
            nb.p_conditional[i][j] = _contents(values)
    return nb
if __name__ == "__main__":
    # Car data from example 'Naive Bayes Classifier example' by Eric Meisner November 22, 2003
    # http://www.inf.u-szeged.hu/~ormandi/teaching/mi2/02-naiveBayes-example.pdf
    xcar = [
        ['Red', 'Sports', 'Domestic'],
        ['Red', 'Sports', 'Domestic'],
        ['Red', 'Sports', 'Domestic'],
        ['Yellow', 'Sports', 'Domestic'],
        ['Yellow', 'Sports', 'Imported'],
        ['Yellow', 'SUV', 'Imported'],
        ['Yellow', 'SUV', 'Imported'],
        ['Yellow', 'SUV', 'Domestic'],
        ['Red', 'SUV', 'Imported'],
        ['Red', 'Sports', 'Imported']
    ]

    ycar = [
        'Yes',
        'No',
        'Yes',
        'No',
        'Yes',
        'No',
        'Yes',
        'No',
        'No',
        'Yes'
    ]

    carmodel = train(xcar, ycar)
    carresult = classify(carmodel, ['Red', 'Sports', 'Domestic'])
    # BUG FIX: the Python 2 print statement is a SyntaxError on Python 3;
    # these print() calls emit identical output on both versions.
    print('Is Yes? %s' % carresult)
    carresult = classify(carmodel, ['Red', 'SUV', 'Domestic'])
    print('Is No? %s' % carresult)