View Javadoc

1   /*
2    * Copyright 2003-2004 The Apache Software Foundation.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.apache.commons.math.stat.regression;
18  import java.io.Serializable;
19  
20  import org.apache.commons.math.MathException;
21  import org.apache.commons.math.distribution.DistributionFactory;
22  import org.apache.commons.math.distribution.TDistribution;
23  
24  /**
25   * Estimates an ordinary least squares regression model
26   * with one independent variable.
27   * <p>
28   * <code> y = intercept + slope * x  </code>
29   * <p>
30   * Standard errors for <code>intercept</code> and <code>slope</code> are 
31   * available as well as ANOVA, r-square and Pearson's r statistics.
32   * <p>
33   * Observations (x,y pairs) can be added to the model one at a time or they 
34   * can be provided in a 2-dimensional array.  The observations are not stored
35   * in memory, so there is no limit to the number of observations that can be
36   * added to the model. 
37   * <p>
38   * <strong>Usage Notes</strong>: <ul>
39   * <li> When there are fewer than two observations in the model, or when
40   * there is no variation in the x values (i.e. all x values are the same) 
41   * all statistics return <code>NaN</code>. At least two observations with
42   * different x coordinates are requred to estimate a bivariate regression 
43   * model.
44   * </li>
45   * <li> getters for the statistics always compute values based on the current
46   * set of observations -- i.e., you can get statistics, then add more data
47   * and get updated statistics without using a new instance.  There is no 
48   * "compute" method that updates all statistics.  Each of the getters performs
49   * the necessary computations to return the requested statistic.</li>
50   * </ul>
51   *
52   * @version $Revision: 348519 $ $Date: 2005-11-23 12:12:18 -0700 (Wed, 23 Nov 2005) $
53   */
54  public class SimpleRegression implements Serializable {
55  
56      /** Serializable version identifier */
57      private static final long serialVersionUID = -3004689053607543335L;
58  
59      /** sum of x values */
60      private double sumX = 0d;
61  
62      /** total variation in x (sum of squared deviations from xbar) */
63      private double sumXX = 0d;
64  
65      /** sum of y values */
66      private double sumY = 0d;
67  
68      /** total variation in y (sum of squared deviations from ybar) */
69      private double sumYY = 0d;
70  
71      /** sum of products */
72      private double sumXY = 0d;
73  
74      /** number of observations */
75      private long n = 0;
76  
77      /** mean of accumulated x values, used in updating formulas */
78      private double xbar = 0;
79  
80      /** mean of accumulated y values, used in updating formulas */
81      private double ybar = 0;
82  
83      // ---------------------Public methods--------------------------------------
84  
85      /**
86       * Create an empty SimpleRegression instance
87       */
88      public SimpleRegression() {
89          super();
90      }
91      
92      /**
93       * Adds the observation (x,y) to the regression data set.
94       * <p>
95       * Uses updating formulas for means and sums of squares defined in 
96       * "Algorithms for Computing the Sample Variance: Analysis and
97       * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J. 
98       * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
99       * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985
100      *
101      *
102      * @param x independent variable value
103      * @param y dependent variable value
104      */
105     public void addData(double x, double y) {
106         if (n == 0) {
107             xbar = x;
108             ybar = y;
109         } else {
110             double dx = x - xbar;
111             double dy = y - ybar;
112             sumXX += dx * dx * (double) n / (double) (n + 1.0);
113             sumYY += dy * dy * (double) n / (double) (n + 1.0);
114             sumXY += dx * dy * (double) n / (double) (n + 1.0);
115             xbar += dx / (double) (n + 1.0);
116             ybar += dy / (double) (n + 1.0);
117         }
118         sumX += x;
119         sumY += y;
120         n++;
121     }
122 
123     /**
124      * Adds the observations represented by the elements in 
125      * <code>data</code>.
126      * <p>
127      * <code>(data[0][0],data[0][1])</code> will be the first observation, then
128      * <code>(data[1][0],data[1][1])</code>, etc. 
129      * <p> 
130      * This method does not replace data that has already been added.  The
131      * observations represented by <code>data</code> are added to the existing
132      * dataset.
133      * <p> 
134      * To replace all data, use <code>clear()</code> before adding the new 
135      * data.
136      * 
137      * @param data array of observations to be added
138      */
139     public void addData(double[][] data) {
140         for (int i = 0; i < data.length; i++) {
141             addData(data[i][0], data[i][1]);
142         }
143     }
144 
145     /**
146      * Clears all data from the model.
147      */
148     public void clear() {
149         sumX = 0d;
150         sumXX = 0d;
151         sumY = 0d;
152         sumYY = 0d;
153         sumXY = 0d;
154         n = 0;
155     }
156 
157     /**
158      * Returns the number of observations that have been added to the model.
159      *
160      * @return n number of observations that have been added.
161      */
162     public long getN() {
163         return n;
164     }
165 
166     /**
167      * Returns the "predicted" <code>y</code> value associated with the 
168      * supplied <code>x</code> value,  based on the data that has been
169      * added to the model when this method is activated.
170      * <p>
171      * <code> predict(x) = intercept + slope * x </code>
172      * <p>
173      * <strong>Preconditions</strong>: <ul>
174      * <li>At least two observations (with at least two different x values)
175      * must have been added before invoking this method. If this method is 
176      * invoked before a model can be estimated, <code>Double,NaN</code> is
177      * returned.
178      * </li></ul>
179      *
180      * @param x input <code>x</code> value
181      * @return predicted <code>y</code> value
182      */
183     public double predict(double x) {
184         double b1 = getSlope();
185         return getIntercept(b1) + b1 * x;
186     }
187 
188     /**
189      * Returns the intercept of the estimated regression line.
190      * <p>
191      * The least squares estimate of the intercept is computed using the 
192      * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
193      * The intercept is sometimes denoted b0. 
194      * <p>
195      * <strong>Preconditions</strong>: <ul>
196      * <li>At least two observations (with at least two different x values)
197      * must have been added before invoking this method. If this method is 
198      * invoked before a model can be estimated, <code>Double,NaN</code> is
199      * returned.
200      * </li></ul>
201      *
202      * @return the intercept of the regression line
203      */
204     public double getIntercept() {
205         return getIntercept(getSlope());
206     }
207 
208     /**
209     * Returns the slope of the estimated regression line.  
210     * <p>
211     * The least squares estimate of the slope is computed using the 
212     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
213     * The slope is sometimes denoted b1. 
214     * <p>
215     * <strong>Preconditions</strong>: <ul>
216     * <li>At least two observations (with at least two different x values)
217     * must have been added before invoking this method. If this method is 
218     * invoked before a model can be estimated, <code>Double.NaN</code> is
219     * returned.
220     * </li></ul>
221     *
222     * @return the slope of the regression line
223     */
224     public double getSlope() {
225         if (n < 2) {
226             return Double.NaN; //not enough data 
227         }
228         if (Math.abs(sumXX) < 10 * Double.MIN_VALUE) {
229             return Double.NaN; //not enough variation in x
230         }
231         return sumXY / sumXX;
232     }
233 
234     /**
235      * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
236      * sum of squared errors</a> (SSE) associated with the regression 
237      * model.
238      * <p>
239      * <strong>Preconditions</strong>: <ul>
240      * <li>At least two observations (with at least two different x values)
241      * must have been added before invoking this method. If this method is 
242      * invoked before a model can be estimated, <code>Double,NaN</code> is
243      * returned.
244      * </li></ul>
245      *
246      * @return sum of squared errors associated with the regression model
247      */
248     public double getSumSquaredErrors() {
249         return getSumSquaredErrors(getSlope());
250     }
251 
252     /**
253      * Returns the sum of squared deviations of the y values about their mean.
254      * <p>
255      * This is defined as SSTO 
256      * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.
257      * <p>
258      * If <code>n < 2</code>, this returns <code>Double.NaN</code>.
259      *
260      * @return sum of squared deviations of y values
261      */
262     public double getTotalSumSquares() {
263         if (n < 2) {
264             return Double.NaN;
265         }
266         return sumYY;
267     }
268 
269     /**
270      * Returns the sum of squared deviations of the predicted y values about 
271      * their mean (which equals the mean of y).
272      * <p>
273      * This is usually abbreviated SSR or SSM.  It is defined as SSM 
274      * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>
275      * <p>
276      * <strong>Preconditions</strong>: <ul>
277      * <li>At least two observations (with at least two different x values)
278      * must have been added before invoking this method. If this method is 
279      * invoked before a model can be estimated, <code>Double.NaN</code> is
280      * returned.
281      * </li></ul>
282      *
283      * @return sum of squared deviations of predicted y values
284      */
285     public double getRegressionSumSquares() {
286         return getRegressionSumSquares(getSlope());
287     }
288 
289     /**
290      * Returns the sum of squared errors divided by the degrees of freedom,
291      * usually abbreviated MSE. 
292      * <p>
293      * If there are fewer than <strong>three</strong> data pairs in the model,
294      * or if there is no variation in <code>x</code>, this returns 
295      * <code>Double.NaN</code>.
296      *
297      * @return sum of squared deviations of y values
298      */
299     public double getMeanSquareError() {
300         if (n < 3) {
301             return Double.NaN;
302         }
303         return getSumSquaredErrors() / (double) (n - 2);
304     }
305 
306     /**
307      * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
308      * Pearson's product moment correlation coefficient</a>,
309      * usually denoted r. 
310      * <p>
311      * <strong>Preconditions</strong>: <ul>
312      * <li>At least two observations (with at least two different x values)
313      * must have been added before invoking this method. If this method is 
314      * invoked before a model can be estimated, <code>Double,NaN</code> is
315      * returned.
316      * </li></ul>
317      *
318      * @return Pearson's r
319      */
320     public double getR() {
321         double b1 = getSlope();
322         double result = Math.sqrt(getRSquare(b1));
323         if (b1 < 0) {
324             result = -result;
325         }
326         return result;
327     }
328 
329     /** 
330      * Returns the <a href="http://www.xycoon.com/coefficient1.htm"> 
331      * coefficient of determination</a>,
332      * usually denoted r-square. 
333      * <p>
334      * <strong>Preconditions</strong>: <ul>
335      * <li>At least two observations (with at least two different x values)
336      * must have been added before invoking this method. If this method is 
337      * invoked before a model can be estimated, <code>Double,NaN</code> is
338      * returned.
339      * </li></ul>
340      *
341      * @return r-square
342      */
343     public double getRSquare() {
344         return getRSquare(getSlope());
345     }
346 
347     /**
348      * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
349      * standard error of the intercept estimate</a>, 
350      * usually denoted s(b0). 
351      * <p>
352      * If there are fewer that <strong>three</strong> observations in the 
353      * model, or if there is no variation in x, this returns 
354      * <code>Double.NaN</code>.
355      *
356      * @return standard error associated with intercept estimate
357      */
358     public double getInterceptStdErr() {
359         return Math.sqrt(
360             getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX));
361     }
362 
363     /**
364      * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
365      * error of the slope estimate</a>,
366      * usually denoted s(b1). 
367      * <p>
368      * If there are fewer that <strong>three</strong> data pairs in the model,
369      * or if there is no variation in x, this returns <code>Double.NaN</code>.
370      *
371      * @return standard error associated with slope estimate
372      */
373     public double getSlopeStdErr() {
374         return Math.sqrt(getMeanSquareError() / sumXX);
375     }
376 
377     /**
378      * Returns the half-width of a 95% confidence interval for the slope
379      * estimate.
380      * <p>
381      * The 95% confidence interval is 
382      * <p>
383      * <code>(getSlope() - getSlopeConfidenceInterval(), 
384      * getSlope() + getSlopeConfidenceInterval())</code>
385      * <p>
386      * If there are fewer that <strong>three</strong> observations in the 
387      * model, or if there is no variation in x, this returns 
388      * <code>Double.NaN</code>.
389      * <p>
390      * <strong>Usage Note</strong>:<br>
391      * The validity of this statistic depends on the assumption that the 
392      * observations included in the model are drawn from a
393      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
394      * Bivariate Normal Distribution</a>.
395      *
396      * @return half-width of 95% confidence interval for the slope estimate
397      * 
398      * @throws MathException if the confidence interval can not be computed.
399      */
400     public double getSlopeConfidenceInterval() throws MathException {
401         return getSlopeConfidenceInterval(0.05d);
402     }
403 
404     /**
405      * Returns the half-width of a (100-100*alpha)% confidence interval for 
406      * the slope estimate.
407      * <p>
408      * The (100-100*alpha)% confidence interval is 
409      * <p>
410      * <code>(getSlope() - getSlopeConfidenceInterval(), 
411      * getSlope() + getSlopeConfidenceInterval())</code>
412      * <p>
413      * To request, for example, a 99% confidence interval, use 
414      * <code>alpha = .01</code>
415      * <p>
416      * <strong>Usage Note</strong>:<br>
417      * The validity of this statistic depends on the assumption that the 
418      * observations included in the model are drawn from a
419      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
420      * Bivariate Normal Distribution</a>.
421      * <p>
422      * <strong> Preconditions:</strong><ul>
423      * <li>If there are fewer that <strong>three</strong> observations in the 
424      * model, or if there is no variation in x, this returns 
425      * <code>Double.NaN</code>. 
426      * </li>
427      * <li><code>(0 < alpha < 1)</code>; otherwise an 
428      * <code>IllegalArgumentException</code> is thrown.
429      * </li></ul>    
430      *
431      * @param alpha the desired significance level 
432      * @return half-width of 95% confidence interval for the slope estimate
433      * @throws MathException if the confidence interval can not be computed.
434      */
435     public double getSlopeConfidenceInterval(double alpha)
436         throws MathException {
437         if (alpha >= 1 || alpha <= 0) {
438             throw new IllegalArgumentException();
439         }
440         return getSlopeStdErr() *
441             getTDistribution().inverseCumulativeProbability(1d - alpha / 2d);
442     }
443 
444     /**
445      * Returns the significance level of the slope (equiv) correlation. 
446      * <p>
447      * Specifically, the returned value is the smallest <code>alpha</code>
448      * such that the slope confidence interval with significance level
449      * equal to <code>alpha</code> does not include <code>0</code>.
450      * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
451      * <p>
452      * <strong>Usage Note</strong>:<br>
453      * The validity of this statistic depends on the assumption that the 
454      * observations included in the model are drawn from a
455      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
456      * Bivariate Normal Distribution</a>.
457      * <p>
458      * If there are fewer that <strong>three</strong> observations in the 
459      * model, or if there is no variation in x, this returns 
460      * <code>Double.NaN</code>.
461      *
462      * @return significance level for slope/correlation
463      * @throws MathException if the significance level can not be computed.
464      */
465     public double getSignificance() throws MathException {
466         return 2d* (1.0 - getTDistribution().cumulativeProbability(
467                     Math.abs(getSlope()) / getSlopeStdErr()));
468     }
469 
470     // ---------------------Private methods-----------------------------------
471 
472     /**
473     * Returns the intercept of the estimated regression line, given the slope.
474     * <p>
475     * Will return <code>NaN</code> if slope is <code>NaN</code>.
476     *
477     * @param slope current slope
478     * @return the intercept of the regression line
479     */
480     private double getIntercept(double slope) {
481         return (sumY - slope * sumX) / ((double) n);
482     }
483 
484     /**
485      * Returns the sum of squared errors associated with the regression 
486      * model, using the slope of the regression line. 
487      * <p> 
488      * Returns NaN if the slope is NaN.
489      * 
490      * @param b1 current slope
491      * @return sum of squared errors associated with the regression model
492      */
493     private double getSumSquaredErrors(double b1) {
494         return sumYY - sumXY * sumXY / sumXX;
495     }
496 
497     /** 
498      * Computes r-square from the slope.
499      * <p>
500      * will return NaN if slope is Nan.
501      *
502      * @param b1 current slope
503      * @return r-square
504      */
505     private double getRSquare(double b1) {
506         double ssto = getTotalSumSquares();
507         return (ssto - getSumSquaredErrors(b1)) / ssto;
508     }
509 
510     /**
511      * Computes SSR from b1.
512      * 
513      * @param slope regression slope estimate
514      * @return sum of squared deviations of predicted y values
515      */
516     private double getRegressionSumSquares(double slope) {
517         return slope * slope * sumXX;
518     }
519 
520     /**
521      * Uses distribution framework to get a t distribution instance 
522      * with df = n - 2
523      *
524      * @return t distribution with df = n - 2
525      */
526     private TDistribution getTDistribution() {
527         return DistributionFactory.newInstance().createTDistribution(n - 2);
528     }
529 }