1
2
3
4
5 """
6 This module provides code for doing logistic regressions.
7
8
9 Classes:
10 LogisticRegression Holds information for a LogisticRegression classifier.
11
12
13 Functions:
14 train Train a new classifier.
15 calculate Calculate the probabilities of each class, given an observation.
16 classify Classify an observation into a class.
17 """
18
19 import numpy
20 import numpy.linalg
21
23 """Holds information necessary to do logistic regression
24 classification.
25
26 Members:
27 beta List of the weights for each dimension.
28
29 """
31 """LogisticRegression()"""
32 self.beta = []
33
34 -def train(xs, ys, update_fn=None, typecode=None):
35 """train(xs, ys[, update_fn]) -> LogisticRegression
36
37 Train a logistic regression classifier on a training set. xs is a
38 list of observations and ys is a list of the class assignments,
39 which should be 0 or 1. xs and ys should contain the same number
40 of elements. update_fn is an optional callback function that
41 takes as parameters that iteration number and log likelihood.
42
43 """
44 if len(xs) != len(ys):
45 raise ValueError("xs and ys should be the same length.")
46 classes = set(ys)
47 if classes != set([0, 1]):
48 raise ValueError("Classes should be 0's and 1's")
49 if typecode is None:
50 typecode = 'd'
51
52
53
54 N, ndims = len(xs), len(xs[0]) + 1
55 if N==0 or ndims==1:
56 raise ValueError("No observations or observation of 0 dimension.")
57
58
59 X = numpy.ones((N, ndims), typecode)
60 X[:, 1:] = xs
61 Xt = numpy.transpose(X)
62 y = numpy.asarray(ys, typecode)
63
64
65 beta = numpy.zeros(ndims, typecode)
66
67 MAX_ITERATIONS = 500
68 CONVERGE_THRESHOLD = 0.01
69 stepsize = 1.0
70
71
72 iter = 0
73 old_beta = old_llik = None
74 while iter < MAX_ITERATIONS:
75
76 ebetaX = numpy.exp(numpy.dot(beta, Xt))
77 p = ebetaX / (1+ebetaX)
78
79
80 logp = y*numpy.log(p) + (1-y)*numpy.log(1-p)
81 llik = sum(logp)
82 if update_fn is not None:
83 update_fn(iter, llik)
84
85
86 if llik < old_llik:
87 stepsize = stepsize / 2.0
88 beta = old_beta
89
90 if old_llik is not None and numpy.fabs(llik-old_llik) <= CONVERGE_THRESHOLD:
91 break
92 old_llik, old_beta = llik, beta
93 iter += 1
94
95 W = numpy.identity(N) * p
96 Xtyp = numpy.dot(Xt, y-p)
97 XtWX = numpy.dot(numpy.dot(Xt, W), X)
98
99
100
101 delta = numpy.linalg.solve(XtWX, Xtyp)
102 if numpy.fabs(stepsize-1.0) > 0.001:
103 delta = delta * stepsize
104 beta = beta + delta
105 else:
106 raise RuntimeError("Didn't converge.")
107
108 lr = LogisticRegression()
109 lr.beta = map(float, beta)
110 return lr
111
113 """calculate(lr, x) -> list of probabilities
114
115 Calculate the probability for each class. lr is a
116 LogisticRegression object. x is the observed data. Returns a
117 list of the probability that it fits each class.
118
119 """
120
121 x = numpy.asarray([1.0] + x)
122
123 ebetaX = numpy.exp(numpy.dot(lr.beta, x))
124 p = ebetaX / (1+ebetaX)
125 return [1-p, p]
126
128 """classify(lr, x) -> 1 or 0
129
130 Classify an observation into a class.
131
132 """
133 probs = calculate(lr, x)
134 if probs[0] > probs[1]:
135 return 0
136 return 1
137