View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.math.stat.regression;
19  import java.io.Serializable;
20  
21  import org.apache.commons.math.MathException;
22  import org.apache.commons.math.MathRuntimeException;
23  import org.apache.commons.math.distribution.TDistribution;
24  import org.apache.commons.math.distribution.TDistributionImpl;
25  
26  /**
27   * Estimates an ordinary least squares regression model
28   * with one independent variable.
29   * <p>
30   * <code> y = intercept + slope * x  </code></p>
31   * <p>
32   * Standard errors for <code>intercept</code> and <code>slope</code> are 
33   * available as well as ANOVA, r-square and Pearson's r statistics.</p>
34   * <p>
35   * Observations (x,y pairs) can be added to the model one at a time or they 
36   * can be provided in a 2-dimensional array.  The observations are not stored
37   * in memory, so there is no limit to the number of observations that can be
38   * added to the model.</p> 
39   * <p>
40   * <strong>Usage Notes</strong>: <ul>
41   * <li> When there are fewer than two observations in the model, or when
42   * there is no variation in the x values (i.e. all x values are the same) 
43   * all statistics return <code>NaN</code>. At least two observations with
44   * different x coordinates are requred to estimate a bivariate regression 
45   * model.
46   * </li>
47   * <li> getters for the statistics always compute values based on the current
48   * set of observations -- i.e., you can get statistics, then add more data
49   * and get updated statistics without using a new instance.  There is no 
50   * "compute" method that updates all statistics.  Each of the getters performs
51   * the necessary computations to return the requested statistic.</li>
52   * </ul></p>
53   *
54   * @version $Revision: 773189 $ $Date: 2009-05-09 05:57:04 -0400 (Sat, 09 May 2009) $
55   */
56  public class SimpleRegression implements Serializable {
57  
58      /** Serializable version identifier */
59      private static final long serialVersionUID = -3004689053607543335L;
60  
61      /** the distribution used to compute inference statistics. */
62      private TDistribution distribution;
63      
64      /** sum of x values */
65      private double sumX = 0d;
66  
67      /** total variation in x (sum of squared deviations from xbar) */
68      private double sumXX = 0d;
69  
70      /** sum of y values */
71      private double sumY = 0d;
72  
73      /** total variation in y (sum of squared deviations from ybar) */
74      private double sumYY = 0d;
75  
76      /** sum of products */
77      private double sumXY = 0d;
78  
79      /** number of observations */
80      private long n = 0;
81  
82      /** mean of accumulated x values, used in updating formulas */
83      private double xbar = 0;
84  
85      /** mean of accumulated y values, used in updating formulas */
86      private double ybar = 0;
87  
88      // ---------------------Public methods--------------------------------------
89  
90      /**
91       * Create an empty SimpleRegression instance
92       */
93      public SimpleRegression() {
94          this(new TDistributionImpl(1.0));
95      }
96      
97      /**
98       * Create an empty SimpleRegression using the given distribution object to
99       * compute inference statistics.
100      * @param t the distribution used to compute inference statistics.
101      * @since 1.2
102      */
103     public SimpleRegression(TDistribution t) {
104         super();
105         setDistribution(t);
106     }
107     
108     /**
109      * Adds the observation (x,y) to the regression data set.
110      * <p>
111      * Uses updating formulas for means and sums of squares defined in 
112      * "Algorithms for Computing the Sample Variance: Analysis and
113      * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J. 
114      * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
115      * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
116      *
117      *
118      * @param x independent variable value
119      * @param y dependent variable value
120      */
121     public void addData(double x, double y) {
122         if (n == 0) {
123             xbar = x;
124             ybar = y;
125         } else {
126             double dx = x - xbar;
127             double dy = y - ybar;
128             sumXX += dx * dx * (double) n / (n + 1d);
129             sumYY += dy * dy * (double) n / (n + 1d);
130             sumXY += dx * dy * (double) n / (n + 1d);
131             xbar += dx / (n + 1.0);
132             ybar += dy / (n + 1.0);
133         }
134         sumX += x;
135         sumY += y;
136         n++;
137         
138         if (n > 2) {
139             distribution.setDegreesOfFreedom(n - 2);
140         }
141     }
142 
143     
144     /**
145      * Removes the observation (x,y) from the regression data set.
146      * <p>
147      * Mirrors the addData method.  This method permits the use of 
148      * SimpleRegression instances in streaming mode where the regression 
149      * is applied to a sliding "window" of observations, however the caller is 
150      * responsible for maintaining the set of observations in the window.</p>
151      * 
152      * The method has no effect if there are no points of data (i.e. n=0)
153      *
154      * @param x independent variable value
155      * @param y dependent variable value
156      */
157     public void removeData(double x, double y) {
158         if (n > 0) {
159             double dx = x - xbar;
160             double dy = y - ybar;
161             sumXX -= dx * dx * (double) n / (n - 1d);
162             sumYY -= dy * dy * (double) n / (n - 1d);
163             sumXY -= dx * dy * (double) n / (n - 1d);
164             xbar -= dx / (n - 1.0);
165             ybar -= dy / (n - 1.0);
166             sumX -= x;
167             sumY -= y;
168             n--;
169             
170             if (n > 2) {
171                 distribution.setDegreesOfFreedom(n - 2);
172             } 
173         }
174     }
175 
176     /**
177      * Adds the observations represented by the elements in 
178      * <code>data</code>.
179      * <p>
180      * <code>(data[0][0],data[0][1])</code> will be the first observation, then
181      * <code>(data[1][0],data[1][1])</code>, etc.</p>
182      * <p> 
183      * This method does not replace data that has already been added.  The
184      * observations represented by <code>data</code> are added to the existing
185      * dataset.</p>
186      * <p> 
187      * To replace all data, use <code>clear()</code> before adding the new 
188      * data.</p>
189      * 
190      * @param data array of observations to be added
191      */
192     public void addData(double[][] data) {
193         for (int i = 0; i < data.length; i++) {
194             addData(data[i][0], data[i][1]);
195         }
196     }
197 
198 
199     /**
200      * Removes observations represented by the elements in <code>data</code>.
201       * <p> 
202      * If the array is larger than the current n, only the first n elements are 
203      * processed.  This method permits the use of SimpleRegression instances in 
204      * streaming mode where the regression is applied to a sliding "window" of 
205      * observations, however the caller is responsible for maintaining the set 
206      * of observations in the window.</p>
207      * <p> 
208      * To remove all data, use <code>clear()</code>.</p>
209      * 
210      * @param data array of observations to be removed
211      */
212     public void removeData(double[][] data) {
213         for (int i = 0; i < data.length && n > 0; i++) {
214             removeData(data[i][0], data[i][1]);
215         }
216     }
217 
218     /**
219      * Clears all data from the model.
220      */
221     public void clear() {
222         sumX = 0d;
223         sumXX = 0d;
224         sumY = 0d;
225         sumYY = 0d;
226         sumXY = 0d;
227         n = 0;
228     }
229 
230     /**
231      * Returns the number of observations that have been added to the model.
232      *
233      * @return n number of observations that have been added.
234      */
235     public long getN() {
236         return n;
237     }
238 
239     /**
240      * Returns the "predicted" <code>y</code> value associated with the 
241      * supplied <code>x</code> value,  based on the data that has been
242      * added to the model when this method is activated.
243      * <p>
244      * <code> predict(x) = intercept + slope * x </code></p>
245      * <p>
246      * <strong>Preconditions</strong>: <ul>
247      * <li>At least two observations (with at least two different x values)
248      * must have been added before invoking this method. If this method is 
249      * invoked before a model can be estimated, <code>Double,NaN</code> is
250      * returned.
251      * </li></ul></p>
252      *
253      * @param x input <code>x</code> value
254      * @return predicted <code>y</code> value
255      */
256     public double predict(double x) {
257         double b1 = getSlope();
258         return getIntercept(b1) + b1 * x;
259     }
260 
261     /**
262      * Returns the intercept of the estimated regression line.
263      * <p>
264      * The least squares estimate of the intercept is computed using the 
265      * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
266      * The intercept is sometimes denoted b0.</p>
267      * <p>
268      * <strong>Preconditions</strong>: <ul>
269      * <li>At least two observations (with at least two different x values)
270      * must have been added before invoking this method. If this method is 
271      * invoked before a model can be estimated, <code>Double,NaN</code> is
272      * returned.
273      * </li></ul></p>
274      *
275      * @return the intercept of the regression line
276      */
277     public double getIntercept() {
278         return getIntercept(getSlope());
279     }
280 
281     /**
282     * Returns the slope of the estimated regression line.  
283     * <p>
284     * The least squares estimate of the slope is computed using the 
285     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
286     * The slope is sometimes denoted b1.</p>
287     * <p>
288     * <strong>Preconditions</strong>: <ul>
289     * <li>At least two observations (with at least two different x values)
290     * must have been added before invoking this method. If this method is 
291     * invoked before a model can be estimated, <code>Double.NaN</code> is
292     * returned.
293     * </li></ul></p>
294     *
295     * @return the slope of the regression line
296     */
297     public double getSlope() {
298         if (n < 2) {
299             return Double.NaN; //not enough data 
300         }
301         if (Math.abs(sumXX) < 10 * Double.MIN_VALUE) {
302             return Double.NaN; //not enough variation in x
303         }
304         return sumXY / sumXX;
305     }
306 
307     /**
308      * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
309      * sum of squared errors</a> (SSE) associated with the regression 
310      * model.
311      * <p>
312      * The sum is computed using the computational formula</p>
313      * <p>
314      * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
315      * <p>
316      * where <code>SYY</code> is the sum of the squared deviations of the y
317      * values about their mean, <code>SXX</code> is similarly defined and
318      * <code>SXY</code> is the sum of the products of x and y mean deviations.
319      * </p><p>
320      * The sums are accumulated using the updating algorithm referenced in 
321      * {@link #addData}.</p>
322      * <p>
323      * The return value is constrained to be non-negative - i.e., if due to 
324      * rounding errors the computational formula returns a negative result, 
325      * 0 is returned.</p>
326      * <p>
327      * <strong>Preconditions</strong>: <ul>
328      * <li>At least two observations (with at least two different x values)
329      * must have been added before invoking this method. If this method is 
330      * invoked before a model can be estimated, <code>Double,NaN</code> is
331      * returned.
332      * </li></ul></p>
333      *
334      * @return sum of squared errors associated with the regression model
335      */
336     public double getSumSquaredErrors() {
337         return Math.max(0d, sumYY - sumXY * sumXY / sumXX);
338     }
339 
340     /**
341      * Returns the sum of squared deviations of the y values about their mean.
342      * <p>
343      * This is defined as SSTO 
344      * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
345      * <p>
346      * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p>
347      *
348      * @return sum of squared deviations of y values
349      */
350     public double getTotalSumSquares() {
351         if (n < 2) {
352             return Double.NaN;
353         }
354         return sumYY;
355     }
356     
357     /**
358      * Returns the sum of squared deviations of the x values about their mean.
359      *
360      * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p>
361      *
362      * @return sum of squared deviations of x values
363      */
364     public double getXSumSquares() {
365         if (n < 2) {
366             return Double.NaN;
367         }
368         return sumXX;
369     }
370     
371     /**
372      * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>.
373      *
374      * @return sum of cross products
375      */
376     public double getSumOfCrossProducts() {
377         return sumXY;
378     }
379 
380     /**
381      * Returns the sum of squared deviations of the predicted y values about 
382      * their mean (which equals the mean of y).
383      * <p>
384      * This is usually abbreviated SSR or SSM.  It is defined as SSM 
385      * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
386      * <p>
387      * <strong>Preconditions</strong>: <ul>
388      * <li>At least two observations (with at least two different x values)
389      * must have been added before invoking this method. If this method is 
390      * invoked before a model can be estimated, <code>Double.NaN</code> is
391      * returned.
392      * </li></ul></p>
393      *
394      * @return sum of squared deviations of predicted y values
395      */
396     public double getRegressionSumSquares() {
397         return getRegressionSumSquares(getSlope());
398     }
399 
400     /**
401      * Returns the sum of squared errors divided by the degrees of freedom,
402      * usually abbreviated MSE. 
403      * <p>
404      * If there are fewer than <strong>three</strong> data pairs in the model,
405      * or if there is no variation in <code>x</code>, this returns 
406      * <code>Double.NaN</code>.</p>
407      *
408      * @return sum of squared deviations of y values
409      */
410     public double getMeanSquareError() {
411         if (n < 3) {
412             return Double.NaN;
413         }
414         return getSumSquaredErrors() / (n - 2);
415     }
416 
417     /**
418      * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
419      * Pearson's product moment correlation coefficient</a>,
420      * usually denoted r. 
421      * <p>
422      * <strong>Preconditions</strong>: <ul>
423      * <li>At least two observations (with at least two different x values)
424      * must have been added before invoking this method. If this method is 
425      * invoked before a model can be estimated, <code>Double,NaN</code> is
426      * returned.
427      * </li></ul></p>
428      *
429      * @return Pearson's r
430      */
431     public double getR() {
432         double b1 = getSlope();
433         double result = Math.sqrt(getRSquare());
434         if (b1 < 0) {
435             result = -result;
436         }
437         return result;
438     }
439 
440     /** 
441      * Returns the <a href="http://www.xycoon.com/coefficient1.htm"> 
442      * coefficient of determination</a>,
443      * usually denoted r-square. 
444      * <p>
445      * <strong>Preconditions</strong>: <ul>
446      * <li>At least two observations (with at least two different x values)
447      * must have been added before invoking this method. If this method is 
448      * invoked before a model can be estimated, <code>Double,NaN</code> is
449      * returned.
450      * </li></ul></p>
451      *
452      * @return r-square
453      */
454     public double getRSquare() {
455         double ssto = getTotalSumSquares();
456         return (ssto - getSumSquaredErrors()) / ssto;
457     }
458 
459     /**
460      * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
461      * standard error of the intercept estimate</a>, 
462      * usually denoted s(b0). 
463      * <p>
464      * If there are fewer that <strong>three</strong> observations in the 
465      * model, or if there is no variation in x, this returns 
466      * <code>Double.NaN</code>.</p>
467      *
468      * @return standard error associated with intercept estimate
469      */
470     public double getInterceptStdErr() {
471         return Math.sqrt(
472             getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX));
473     }
474 
475     /**
476      * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
477      * error of the slope estimate</a>,
478      * usually denoted s(b1). 
479      * <p>
480      * If there are fewer that <strong>three</strong> data pairs in the model,
481      * or if there is no variation in x, this returns <code>Double.NaN</code>.
482      * </p>
483      * 
484      * @return standard error associated with slope estimate
485      */
486     public double getSlopeStdErr() {
487         return Math.sqrt(getMeanSquareError() / sumXX);
488     }
489 
490     /**
491      * Returns the half-width of a 95% confidence interval for the slope
492      * estimate.
493      * <p>
494      * The 95% confidence interval is</p>
495      * <p>
496      * <code>(getSlope() - getSlopeConfidenceInterval(), 
497      * getSlope() + getSlopeConfidenceInterval())</code></p>
498      * <p>
499      * If there are fewer that <strong>three</strong> observations in the 
500      * model, or if there is no variation in x, this returns 
501      * <code>Double.NaN</code>.</p>
502      * <p>
503      * <strong>Usage Note</strong>:<br>
504      * The validity of this statistic depends on the assumption that the 
505      * observations included in the model are drawn from a
506      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
507      * Bivariate Normal Distribution</a>.</p>
508      *
509      * @return half-width of 95% confidence interval for the slope estimate
510      * @throws MathException if the confidence interval can not be computed.
511      */
512     public double getSlopeConfidenceInterval() throws MathException {
513         return getSlopeConfidenceInterval(0.05d);
514     }
515 
516     /**
517      * Returns the half-width of a (100-100*alpha)% confidence interval for 
518      * the slope estimate.
519      * <p>
520      * The (100-100*alpha)% confidence interval is </p>
521      * <p>
522      * <code>(getSlope() - getSlopeConfidenceInterval(), 
523      * getSlope() + getSlopeConfidenceInterval())</code></p>
524      * <p>
525      * To request, for example, a 99% confidence interval, use 
526      * <code>alpha = .01</code></p>
527      * <p>
528      * <strong>Usage Note</strong>:<br>
529      * The validity of this statistic depends on the assumption that the 
530      * observations included in the model are drawn from a
531      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
532      * Bivariate Normal Distribution</a>.</p>
533      * <p>
534      * <strong> Preconditions:</strong><ul>
535      * <li>If there are fewer that <strong>three</strong> observations in the 
536      * model, or if there is no variation in x, this returns 
537      * <code>Double.NaN</code>.
538      * </li>
539      * <li><code>(0 < alpha < 1)</code>; otherwise an 
540      * <code>IllegalArgumentException</code> is thrown.
541      * </li></ul></p> 
542      *
543      * @param alpha the desired significance level 
544      * @return half-width of 95% confidence interval for the slope estimate
545      * @throws MathException if the confidence interval can not be computed.
546      */
547     public double getSlopeConfidenceInterval(double alpha)
548         throws MathException {
549         if (alpha >= 1 || alpha <= 0) {
550             throw MathRuntimeException.createIllegalArgumentException(
551                   "out of bounds significance level {0}, must be between {1} and {2}",
552                   alpha, 0.0, 1.0);
553         }
554         return getSlopeStdErr() *
555             distribution.inverseCumulativeProbability(1d - alpha / 2d);
556     }
557 
558     /**
559      * Returns the significance level of the slope (equiv) correlation. 
560      * <p>
561      * Specifically, the returned value is the smallest <code>alpha</code>
562      * such that the slope confidence interval with significance level
563      * equal to <code>alpha</code> does not include <code>0</code>.
564      * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
565      * </p><p>
566      * <strong>Usage Note</strong>:<br>
567      * The validity of this statistic depends on the assumption that the 
568      * observations included in the model are drawn from a
569      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
570      * Bivariate Normal Distribution</a>.</p>
571      * <p>
572      * If there are fewer that <strong>three</strong> observations in the 
573      * model, or if there is no variation in x, this returns 
574      * <code>Double.NaN</code>.</p>
575      *
576      * @return significance level for slope/correlation
577      * @throws MathException if the significance level can not be computed.
578      */
579     public double getSignificance() throws MathException {
580         return 2d * (1.0 - distribution.cumulativeProbability(
581                     Math.abs(getSlope()) / getSlopeStdErr()));
582     }
583 
584     // ---------------------Private methods-----------------------------------
585 
586     /**
587     * Returns the intercept of the estimated regression line, given the slope.
588     * <p>
589     * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
590     *
591     * @param slope current slope
592     * @return the intercept of the regression line
593     */
594     private double getIntercept(double slope) {
595         return (sumY - slope * sumX) / n;
596     }
597 
598     /**
599      * Computes SSR from b1.
600      * 
601      * @param slope regression slope estimate
602      * @return sum of squared deviations of predicted y values
603      */
604     private double getRegressionSumSquares(double slope) {
605         return slope * slope * sumXX;
606     }
607     
608     /**
609      * Modify the distribution used to compute inference statistics.
610      * @param value the new distribution
611      * @since 1.2
612      */
613     public void setDistribution(TDistribution value) {
614         distribution = value;
615         
616         // modify degrees of freedom
617         if (n > 2) {
618             distribution.setDegreesOfFreedom(n - 2);
619         }
620     }
621 }