1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.math.stat.regression;
18 import java.io.Serializable;
19
20 import org.apache.commons.math.MathException;
21 import org.apache.commons.math.distribution.DistributionFactory;
22 import org.apache.commons.math.distribution.TDistribution;
23
24 /**
25 * Estimates an ordinary least squares regression model
26 * with one independent variable.
27 * <p>
28 * <code> y = intercept + slope * x </code>
29 * <p>
30 * Standard errors for <code>intercept</code> and <code>slope</code> are
31 * available as well as ANOVA, r-square and Pearson's r statistics.
32 * <p>
33 * Observations (x,y pairs) can be added to the model one at a time or they
34 * can be provided in a 2-dimensional array. The observations are not stored
35 * in memory, so there is no limit to the number of observations that can be
36 * added to the model.
37 * <p>
38 * <strong>Usage Notes</strong>: <ul>
39 * <li> When there are fewer than two observations in the model, or when
40 * there is no variation in the x values (i.e. all x values are the same)
41 * all statistics return <code>NaN</code>. At least two observations with
42 * different x coordinates are requred to estimate a bivariate regression
43 * model.
44 * </li>
45 * <li> getters for the statistics always compute values based on the current
46 * set of observations -- i.e., you can get statistics, then add more data
47 * and get updated statistics without using a new instance. There is no
48 * "compute" method that updates all statistics. Each of the getters performs
49 * the necessary computations to return the requested statistic.</li>
50 * </ul>
51 *
52 * @version $Revision: 348519 $ $Date: 2005-11-23 12:12:18 -0700 (Wed, 23 Nov 2005) $
53 */
54 public class SimpleRegression implements Serializable {
55
56 /** Serializable version identifier */
57 private static final long serialVersionUID = -3004689053607543335L;
58
59 /** sum of x values */
60 private double sumX = 0d;
61
62 /** total variation in x (sum of squared deviations from xbar) */
63 private double sumXX = 0d;
64
65 /** sum of y values */
66 private double sumY = 0d;
67
68 /** total variation in y (sum of squared deviations from ybar) */
69 private double sumYY = 0d;
70
71 /** sum of products */
72 private double sumXY = 0d;
73
74 /** number of observations */
75 private long n = 0;
76
77 /** mean of accumulated x values, used in updating formulas */
78 private double xbar = 0;
79
80 /** mean of accumulated y values, used in updating formulas */
81 private double ybar = 0;
82
83
84
85 /**
86 * Create an empty SimpleRegression instance
87 */
88 public SimpleRegression() {
89 super();
90 }
91
92 /**
93 * Adds the observation (x,y) to the regression data set.
94 * <p>
95 * Uses updating formulas for means and sums of squares defined in
96 * "Algorithms for Computing the Sample Variance: Analysis and
97 * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
98 * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
99 * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985
100 *
101 *
102 * @param x independent variable value
103 * @param y dependent variable value
104 */
105 public void addData(double x, double y) {
106 if (n == 0) {
107 xbar = x;
108 ybar = y;
109 } else {
110 double dx = x - xbar;
111 double dy = y - ybar;
112 sumXX += dx * dx * (double) n / (double) (n + 1.0);
113 sumYY += dy * dy * (double) n / (double) (n + 1.0);
114 sumXY += dx * dy * (double) n / (double) (n + 1.0);
115 xbar += dx / (double) (n + 1.0);
116 ybar += dy / (double) (n + 1.0);
117 }
118 sumX += x;
119 sumY += y;
120 n++;
121 }
122
123 /**
124 * Adds the observations represented by the elements in
125 * <code>data</code>.
126 * <p>
127 * <code>(data[0][0],data[0][1])</code> will be the first observation, then
128 * <code>(data[1][0],data[1][1])</code>, etc.
129 * <p>
130 * This method does not replace data that has already been added. The
131 * observations represented by <code>data</code> are added to the existing
132 * dataset.
133 * <p>
134 * To replace all data, use <code>clear()</code> before adding the new
135 * data.
136 *
137 * @param data array of observations to be added
138 */
139 public void addData(double[][] data) {
140 for (int i = 0; i < data.length; i++) {
141 addData(data[i][0], data[i][1]);
142 }
143 }
144
145 /**
146 * Clears all data from the model.
147 */
148 public void clear() {
149 sumX = 0d;
150 sumXX = 0d;
151 sumY = 0d;
152 sumYY = 0d;
153 sumXY = 0d;
154 n = 0;
155 }
156
157 /**
158 * Returns the number of observations that have been added to the model.
159 *
160 * @return n number of observations that have been added.
161 */
162 public long getN() {
163 return n;
164 }
165
166 /**
167 * Returns the "predicted" <code>y</code> value associated with the
168 * supplied <code>x</code> value, based on the data that has been
169 * added to the model when this method is activated.
170 * <p>
171 * <code> predict(x) = intercept + slope * x </code>
172 * <p>
173 * <strong>Preconditions</strong>: <ul>
174 * <li>At least two observations (with at least two different x values)
175 * must have been added before invoking this method. If this method is
176 * invoked before a model can be estimated, <code>Double,NaN</code> is
177 * returned.
178 * </li></ul>
179 *
180 * @param x input <code>x</code> value
181 * @return predicted <code>y</code> value
182 */
183 public double predict(double x) {
184 double b1 = getSlope();
185 return getIntercept(b1) + b1 * x;
186 }
187
188 /**
189 * Returns the intercept of the estimated regression line.
190 * <p>
191 * The least squares estimate of the intercept is computed using the
192 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
193 * The intercept is sometimes denoted b0.
194 * <p>
195 * <strong>Preconditions</strong>: <ul>
196 * <li>At least two observations (with at least two different x values)
197 * must have been added before invoking this method. If this method is
198 * invoked before a model can be estimated, <code>Double,NaN</code> is
199 * returned.
200 * </li></ul>
201 *
202 * @return the intercept of the regression line
203 */
204 public double getIntercept() {
205 return getIntercept(getSlope());
206 }
207
208 /**
209 * Returns the slope of the estimated regression line.
210 * <p>
211 * The least squares estimate of the slope is computed using the
212 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
213 * The slope is sometimes denoted b1.
214 * <p>
215 * <strong>Preconditions</strong>: <ul>
216 * <li>At least two observations (with at least two different x values)
217 * must have been added before invoking this method. If this method is
218 * invoked before a model can be estimated, <code>Double.NaN</code> is
219 * returned.
220 * </li></ul>
221 *
222 * @return the slope of the regression line
223 */
224 public double getSlope() {
225 if (n < 2) {
226 return Double.NaN;
227 }
228 if (Math.abs(sumXX) < 10 * Double.MIN_VALUE) {
229 return Double.NaN;
230 }
231 return sumXY / sumXX;
232 }
233
234 /**
235 * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
236 * sum of squared errors</a> (SSE) associated with the regression
237 * model.
238 * <p>
239 * <strong>Preconditions</strong>: <ul>
240 * <li>At least two observations (with at least two different x values)
241 * must have been added before invoking this method. If this method is
242 * invoked before a model can be estimated, <code>Double,NaN</code> is
243 * returned.
244 * </li></ul>
245 *
246 * @return sum of squared errors associated with the regression model
247 */
248 public double getSumSquaredErrors() {
249 return getSumSquaredErrors(getSlope());
250 }
251
252 /**
253 * Returns the sum of squared deviations of the y values about their mean.
254 * <p>
255 * This is defined as SSTO
256 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.
257 * <p>
258 * If <code>n < 2</code>, this returns <code>Double.NaN</code>.
259 *
260 * @return sum of squared deviations of y values
261 */
262 public double getTotalSumSquares() {
263 if (n < 2) {
264 return Double.NaN;
265 }
266 return sumYY;
267 }
268
269 /**
270 * Returns the sum of squared deviations of the predicted y values about
271 * their mean (which equals the mean of y).
272 * <p>
273 * This is usually abbreviated SSR or SSM. It is defined as SSM
274 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>
275 * <p>
276 * <strong>Preconditions</strong>: <ul>
277 * <li>At least two observations (with at least two different x values)
278 * must have been added before invoking this method. If this method is
279 * invoked before a model can be estimated, <code>Double.NaN</code> is
280 * returned.
281 * </li></ul>
282 *
283 * @return sum of squared deviations of predicted y values
284 */
285 public double getRegressionSumSquares() {
286 return getRegressionSumSquares(getSlope());
287 }
288
289 /**
290 * Returns the sum of squared errors divided by the degrees of freedom,
291 * usually abbreviated MSE.
292 * <p>
293 * If there are fewer than <strong>three</strong> data pairs in the model,
294 * or if there is no variation in <code>x</code>, this returns
295 * <code>Double.NaN</code>.
296 *
297 * @return sum of squared deviations of y values
298 */
299 public double getMeanSquareError() {
300 if (n < 3) {
301 return Double.NaN;
302 }
303 return getSumSquaredErrors() / (double) (n - 2);
304 }
305
306 /**
307 * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
308 * Pearson's product moment correlation coefficient</a>,
309 * usually denoted r.
310 * <p>
311 * <strong>Preconditions</strong>: <ul>
312 * <li>At least two observations (with at least two different x values)
313 * must have been added before invoking this method. If this method is
314 * invoked before a model can be estimated, <code>Double,NaN</code> is
315 * returned.
316 * </li></ul>
317 *
318 * @return Pearson's r
319 */
320 public double getR() {
321 double b1 = getSlope();
322 double result = Math.sqrt(getRSquare(b1));
323 if (b1 < 0) {
324 result = -result;
325 }
326 return result;
327 }
328
329 /**
330 * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
331 * coefficient of determination</a>,
332 * usually denoted r-square.
333 * <p>
334 * <strong>Preconditions</strong>: <ul>
335 * <li>At least two observations (with at least two different x values)
336 * must have been added before invoking this method. If this method is
337 * invoked before a model can be estimated, <code>Double,NaN</code> is
338 * returned.
339 * </li></ul>
340 *
341 * @return r-square
342 */
343 public double getRSquare() {
344 return getRSquare(getSlope());
345 }
346
347 /**
348 * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
349 * standard error of the intercept estimate</a>,
350 * usually denoted s(b0).
351 * <p>
352 * If there are fewer that <strong>three</strong> observations in the
353 * model, or if there is no variation in x, this returns
354 * <code>Double.NaN</code>.
355 *
356 * @return standard error associated with intercept estimate
357 */
358 public double getInterceptStdErr() {
359 return Math.sqrt(
360 getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX));
361 }
362
363 /**
364 * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
365 * error of the slope estimate</a>,
366 * usually denoted s(b1).
367 * <p>
368 * If there are fewer that <strong>three</strong> data pairs in the model,
369 * or if there is no variation in x, this returns <code>Double.NaN</code>.
370 *
371 * @return standard error associated with slope estimate
372 */
373 public double getSlopeStdErr() {
374 return Math.sqrt(getMeanSquareError() / sumXX);
375 }
376
377 /**
378 * Returns the half-width of a 95% confidence interval for the slope
379 * estimate.
380 * <p>
381 * The 95% confidence interval is
382 * <p>
383 * <code>(getSlope() - getSlopeConfidenceInterval(),
384 * getSlope() + getSlopeConfidenceInterval())</code>
385 * <p>
386 * If there are fewer that <strong>three</strong> observations in the
387 * model, or if there is no variation in x, this returns
388 * <code>Double.NaN</code>.
389 * <p>
390 * <strong>Usage Note</strong>:<br>
391 * The validity of this statistic depends on the assumption that the
392 * observations included in the model are drawn from a
393 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
394 * Bivariate Normal Distribution</a>.
395 *
396 * @return half-width of 95% confidence interval for the slope estimate
397 *
398 * @throws MathException if the confidence interval can not be computed.
399 */
400 public double getSlopeConfidenceInterval() throws MathException {
401 return getSlopeConfidenceInterval(0.05d);
402 }
403
404 /**
405 * Returns the half-width of a (100-100*alpha)% confidence interval for
406 * the slope estimate.
407 * <p>
408 * The (100-100*alpha)% confidence interval is
409 * <p>
410 * <code>(getSlope() - getSlopeConfidenceInterval(),
411 * getSlope() + getSlopeConfidenceInterval())</code>
412 * <p>
413 * To request, for example, a 99% confidence interval, use
414 * <code>alpha = .01</code>
415 * <p>
416 * <strong>Usage Note</strong>:<br>
417 * The validity of this statistic depends on the assumption that the
418 * observations included in the model are drawn from a
419 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
420 * Bivariate Normal Distribution</a>.
421 * <p>
422 * <strong> Preconditions:</strong><ul>
423 * <li>If there are fewer that <strong>three</strong> observations in the
424 * model, or if there is no variation in x, this returns
425 * <code>Double.NaN</code>.
426 * </li>
427 * <li><code>(0 < alpha < 1)</code>; otherwise an
428 * <code>IllegalArgumentException</code> is thrown.
429 * </li></ul>
430 *
431 * @param alpha the desired significance level
432 * @return half-width of 95% confidence interval for the slope estimate
433 * @throws MathException if the confidence interval can not be computed.
434 */
435 public double getSlopeConfidenceInterval(double alpha)
436 throws MathException {
437 if (alpha >= 1 || alpha <= 0) {
438 throw new IllegalArgumentException();
439 }
440 return getSlopeStdErr() *
441 getTDistribution().inverseCumulativeProbability(1d - alpha / 2d);
442 }
443
444 /**
445 * Returns the significance level of the slope (equiv) correlation.
446 * <p>
447 * Specifically, the returned value is the smallest <code>alpha</code>
448 * such that the slope confidence interval with significance level
449 * equal to <code>alpha</code> does not include <code>0</code>.
450 * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
451 * <p>
452 * <strong>Usage Note</strong>:<br>
453 * The validity of this statistic depends on the assumption that the
454 * observations included in the model are drawn from a
455 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
456 * Bivariate Normal Distribution</a>.
457 * <p>
458 * If there are fewer that <strong>three</strong> observations in the
459 * model, or if there is no variation in x, this returns
460 * <code>Double.NaN</code>.
461 *
462 * @return significance level for slope/correlation
463 * @throws MathException if the significance level can not be computed.
464 */
465 public double getSignificance() throws MathException {
466 return 2d* (1.0 - getTDistribution().cumulativeProbability(
467 Math.abs(getSlope()) / getSlopeStdErr()));
468 }
469
470
471
472 /**
473 * Returns the intercept of the estimated regression line, given the slope.
474 * <p>
475 * Will return <code>NaN</code> if slope is <code>NaN</code>.
476 *
477 * @param slope current slope
478 * @return the intercept of the regression line
479 */
480 private double getIntercept(double slope) {
481 return (sumY - slope * sumX) / ((double) n);
482 }
483
484 /**
485 * Returns the sum of squared errors associated with the regression
486 * model, using the slope of the regression line.
487 * <p>
488 * Returns NaN if the slope is NaN.
489 *
490 * @param b1 current slope
491 * @return sum of squared errors associated with the regression model
492 */
493 private double getSumSquaredErrors(double b1) {
494 return sumYY - sumXY * sumXY / sumXX;
495 }
496
497 /**
498 * Computes r-square from the slope.
499 * <p>
500 * will return NaN if slope is Nan.
501 *
502 * @param b1 current slope
503 * @return r-square
504 */
505 private double getRSquare(double b1) {
506 double ssto = getTotalSumSquares();
507 return (ssto - getSumSquaredErrors(b1)) / ssto;
508 }
509
510 /**
511 * Computes SSR from b1.
512 *
513 * @param slope regression slope estimate
514 * @return sum of squared deviations of predicted y values
515 */
516 private double getRegressionSumSquares(double slope) {
517 return slope * slope * sumXX;
518 }
519
520 /**
521 * Uses distribution framework to get a t distribution instance
522 * with df = n - 2
523 *
524 * @return t distribution with df = n - 2
525 */
526 private TDistribution getTDistribution() {
527 return DistributionFactory.newInstance().createTDistribution(n - 2);
528 }
529 }