1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements.  See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License.  You may obtain a copy of the License at
8  *
9  *      http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 package org.apache.commons.math.stat.regression;
19 import java.io.Serializable;
20 
21 import org.apache.commons.math.MathException;
22 import org.apache.commons.math.MathRuntimeException;
23 import org.apache.commons.math.distribution.TDistribution;
24 import org.apache.commons.math.distribution.TDistributionImpl;
25 import org.apache.commons.math.exception.util.LocalizedFormats;
26 import org.apache.commons.math.util.FastMath;
27 
28 /**
29  * Estimates an ordinary least squares regression model
30  * with one independent variable.
31  * <p>
32  * <code> y = intercept + slope * x  </code></p>
33  * <p>
34  * Standard errors for <code>intercept</code> and <code>slope</code> are
35  * available as well as ANOVA, r-square and Pearson's r statistics.</p>
36  * <p>
37  * Observations (x,y pairs) can be added to the model one at a time or they
38  * can be provided in a 2-dimensional array.  The observations are not stored
39  * in memory, so there is no limit to the number of observations that can be
40  * added to the model.</p>
41  * <p>
42  * <strong>Usage Notes</strong>: <ul>
43  * <li> When there are fewer than two observations in the model, or when
44  * there is no variation in the x values (i.e. all x values are the same)
45  * all statistics return <code>NaN</code>. At least two observations with
46  * different x coordinates are requred to estimate a bivariate regression
47  * model.
48  * </li>
49  * <li> getters for the statistics always compute values based on the current
50  * set of observations -- i.e., you can get statistics, then add more data
51  * and get updated statistics without using a new instance.  There is no
52  * "compute" method that updates all statistics.  Each of the getters performs
53  * the necessary computations to return the requested statistic.</li>
54  * </ul></p>
55  *
56  * @version $Revision: 1042336 $ $Date: 2010-12-05 13:40:48 +0100 (dim. 05 déc. 2010) $
57  */
58 public class SimpleRegression implements Serializable {
59 
60     /** Serializable version identifier */
61     private static final long serialVersionUID = -3004689053607543335L;
62 
63     /** the distribution used to compute inference statistics. */
64     private TDistribution distribution;
65 
66     /** sum of x values */
67     private double sumX = 0d;
68 
69     /** total variation in x (sum of squared deviations from xbar) */
70     private double sumXX = 0d;
71 
72     /** sum of y values */
73     private double sumY = 0d;
74 
75     /** total variation in y (sum of squared deviations from ybar) */
76     private double sumYY = 0d;
77 
78     /** sum of products */
79     private double sumXY = 0d;
80 
81     /** number of observations */
82     private long n = 0;
83 
84     /** mean of accumulated x values, used in updating formulas */
85     private double xbar = 0;
86 
87     /** mean of accumulated y values, used in updating formulas */
88     private double ybar = 0;
89 
90     // ---------------------Public methods--------------------------------------
91 
92     /**
93      * Create an empty SimpleRegression instance
94      */
SimpleRegression()95     public SimpleRegression() {
96         this(new TDistributionImpl(1.0));
97     }
98 
99     /**
100      * Create an empty SimpleRegression using the given distribution object to
101      * compute inference statistics.
102      * @param t the distribution used to compute inference statistics.
103      * @since 1.2
104      * @deprecated in 2.2 (to be removed in 3.0). Please use the {@link
105      * #SimpleRegression(int) other constructor} instead.
106      */
107     @Deprecated
SimpleRegression(TDistribution t)108     public SimpleRegression(TDistribution t) {
109         super();
110         setDistribution(t);
111     }
112 
113     /**
114      * Create an empty SimpleRegression.
115      *
116      * @param degrees Number of degrees of freedom of the distribution
117      * used to compute inference statistics.
118      * @since 2.2
119      */
SimpleRegression(int degrees)120     public SimpleRegression(int degrees) {
121         setDistribution(new TDistributionImpl(degrees));
122     }
123 
124     /**
125      * Adds the observation (x,y) to the regression data set.
126      * <p>
127      * Uses updating formulas for means and sums of squares defined in
128      * "Algorithms for Computing the Sample Variance: Analysis and
129      * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
130      * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
131      * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
132      *
133      *
134      * @param x independent variable value
135      * @param y dependent variable value
136      */
addData(double x, double y)137     public void addData(double x, double y) {
138         if (n == 0) {
139             xbar = x;
140             ybar = y;
141         } else {
142             double dx = x - xbar;
143             double dy = y - ybar;
144             sumXX += dx * dx * (double) n / (n + 1d);
145             sumYY += dy * dy * (double) n / (n + 1d);
146             sumXY += dx * dy * (double) n / (n + 1d);
147             xbar += dx / (n + 1.0);
148             ybar += dy / (n + 1.0);
149         }
150         sumX += x;
151         sumY += y;
152         n++;
153 
154         if (n > 2) {
155             distribution.setDegreesOfFreedom(n - 2);
156         }
157     }
158 
159 
160     /**
161      * Removes the observation (x,y) from the regression data set.
162      * <p>
163      * Mirrors the addData method.  This method permits the use of
164      * SimpleRegression instances in streaming mode where the regression
165      * is applied to a sliding "window" of observations, however the caller is
166      * responsible for maintaining the set of observations in the window.</p>
167      *
168      * The method has no effect if there are no points of data (i.e. n=0)
169      *
170      * @param x independent variable value
171      * @param y dependent variable value
172      */
removeData(double x, double y)173     public void removeData(double x, double y) {
174         if (n > 0) {
175             double dx = x - xbar;
176             double dy = y - ybar;
177             sumXX -= dx * dx * (double) n / (n - 1d);
178             sumYY -= dy * dy * (double) n / (n - 1d);
179             sumXY -= dx * dy * (double) n / (n - 1d);
180             xbar -= dx / (n - 1.0);
181             ybar -= dy / (n - 1.0);
182             sumX -= x;
183             sumY -= y;
184             n--;
185 
186             if (n > 2) {
187                 distribution.setDegreesOfFreedom(n - 2);
188             }
189         }
190     }
191 
192     /**
193      * Adds the observations represented by the elements in
194      * <code>data</code>.
195      * <p>
196      * <code>(data[0][0],data[0][1])</code> will be the first observation, then
197      * <code>(data[1][0],data[1][1])</code>, etc.</p>
198      * <p>
199      * This method does not replace data that has already been added.  The
200      * observations represented by <code>data</code> are added to the existing
201      * dataset.</p>
202      * <p>
203      * To replace all data, use <code>clear()</code> before adding the new
204      * data.</p>
205      *
206      * @param data array of observations to be added
207      */
addData(double[][] data)208     public void addData(double[][] data) {
209         for (int i = 0; i < data.length; i++) {
210             addData(data[i][0], data[i][1]);
211         }
212     }
213 
214 
215     /**
216      * Removes observations represented by the elements in <code>data</code>.
217       * <p>
218      * If the array is larger than the current n, only the first n elements are
219      * processed.  This method permits the use of SimpleRegression instances in
220      * streaming mode where the regression is applied to a sliding "window" of
221      * observations, however the caller is responsible for maintaining the set
222      * of observations in the window.</p>
223      * <p>
224      * To remove all data, use <code>clear()</code>.</p>
225      *
226      * @param data array of observations to be removed
227      */
removeData(double[][] data)228     public void removeData(double[][] data) {
229         for (int i = 0; i < data.length && n > 0; i++) {
230             removeData(data[i][0], data[i][1]);
231         }
232     }
233 
234     /**
235      * Clears all data from the model.
236      */
clear()237     public void clear() {
238         sumX = 0d;
239         sumXX = 0d;
240         sumY = 0d;
241         sumYY = 0d;
242         sumXY = 0d;
243         n = 0;
244     }
245 
246     /**
247      * Returns the number of observations that have been added to the model.
248      *
249      * @return n number of observations that have been added.
250      */
getN()251     public long getN() {
252         return n;
253     }
254 
255     /**
256      * Returns the "predicted" <code>y</code> value associated with the
257      * supplied <code>x</code> value,  based on the data that has been
258      * added to the model when this method is activated.
259      * <p>
260      * <code> predict(x) = intercept + slope * x </code></p>
261      * <p>
262      * <strong>Preconditions</strong>: <ul>
263      * <li>At least two observations (with at least two different x values)
264      * must have been added before invoking this method. If this method is
265      * invoked before a model can be estimated, <code>Double,NaN</code> is
266      * returned.
267      * </li></ul></p>
268      *
269      * @param x input <code>x</code> value
270      * @return predicted <code>y</code> value
271      */
predict(double x)272     public double predict(double x) {
273         double b1 = getSlope();
274         return getIntercept(b1) + b1 * x;
275     }
276 
277     /**
278      * Returns the intercept of the estimated regression line.
279      * <p>
280      * The least squares estimate of the intercept is computed using the
281      * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
282      * The intercept is sometimes denoted b0.</p>
283      * <p>
284      * <strong>Preconditions</strong>: <ul>
285      * <li>At least two observations (with at least two different x values)
286      * must have been added before invoking this method. If this method is
287      * invoked before a model can be estimated, <code>Double,NaN</code> is
288      * returned.
289      * </li></ul></p>
290      *
291      * @return the intercept of the regression line
292      */
getIntercept()293     public double getIntercept() {
294         return getIntercept(getSlope());
295     }
296 
297     /**
298     * Returns the slope of the estimated regression line.
299     * <p>
300     * The least squares estimate of the slope is computed using the
301     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
302     * The slope is sometimes denoted b1.</p>
303     * <p>
304     * <strong>Preconditions</strong>: <ul>
305     * <li>At least two observations (with at least two different x values)
306     * must have been added before invoking this method. If this method is
307     * invoked before a model can be estimated, <code>Double.NaN</code> is
308     * returned.
309     * </li></ul></p>
310     *
311     * @return the slope of the regression line
312     */
getSlope()313     public double getSlope() {
314         if (n < 2) {
315             return Double.NaN; //not enough data
316         }
317         if (FastMath.abs(sumXX) < 10 * Double.MIN_VALUE) {
318             return Double.NaN; //not enough variation in x
319         }
320         return sumXY / sumXX;
321     }
322 
323     /**
324      * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
325      * sum of squared errors</a> (SSE) associated with the regression
326      * model.
327      * <p>
328      * The sum is computed using the computational formula</p>
329      * <p>
330      * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
331      * <p>
332      * where <code>SYY</code> is the sum of the squared deviations of the y
333      * values about their mean, <code>SXX</code> is similarly defined and
334      * <code>SXY</code> is the sum of the products of x and y mean deviations.
335      * </p><p>
336      * The sums are accumulated using the updating algorithm referenced in
337      * {@link #addData}.</p>
338      * <p>
339      * The return value is constrained to be non-negative - i.e., if due to
340      * rounding errors the computational formula returns a negative result,
341      * 0 is returned.</p>
342      * <p>
343      * <strong>Preconditions</strong>: <ul>
344      * <li>At least two observations (with at least two different x values)
345      * must have been added before invoking this method. If this method is
346      * invoked before a model can be estimated, <code>Double,NaN</code> is
347      * returned.
348      * </li></ul></p>
349      *
350      * @return sum of squared errors associated with the regression model
351      */
getSumSquaredErrors()352     public double getSumSquaredErrors() {
353         return FastMath.max(0d, sumYY - sumXY * sumXY / sumXX);
354     }
355 
356     /**
357      * Returns the sum of squared deviations of the y values about their mean.
358      * <p>
359      * This is defined as SSTO
360      * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
361      * <p>
362      * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p>
363      *
364      * @return sum of squared deviations of y values
365      */
getTotalSumSquares()366     public double getTotalSumSquares() {
367         if (n < 2) {
368             return Double.NaN;
369         }
370         return sumYY;
371     }
372 
373     /**
374      * Returns the sum of squared deviations of the x values about their mean.
375      *
376      * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p>
377      *
378      * @return sum of squared deviations of x values
379      */
getXSumSquares()380     public double getXSumSquares() {
381         if (n < 2) {
382             return Double.NaN;
383         }
384         return sumXX;
385     }
386 
387     /**
388      * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>.
389      *
390      * @return sum of cross products
391      */
getSumOfCrossProducts()392     public double getSumOfCrossProducts() {
393         return sumXY;
394     }
395 
396     /**
397      * Returns the sum of squared deviations of the predicted y values about
398      * their mean (which equals the mean of y).
399      * <p>
400      * This is usually abbreviated SSR or SSM.  It is defined as SSM
401      * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
402      * <p>
403      * <strong>Preconditions</strong>: <ul>
404      * <li>At least two observations (with at least two different x values)
405      * must have been added before invoking this method. If this method is
406      * invoked before a model can be estimated, <code>Double.NaN</code> is
407      * returned.
408      * </li></ul></p>
409      *
410      * @return sum of squared deviations of predicted y values
411      */
getRegressionSumSquares()412     public double getRegressionSumSquares() {
413         return getRegressionSumSquares(getSlope());
414     }
415 
416     /**
417      * Returns the sum of squared errors divided by the degrees of freedom,
418      * usually abbreviated MSE.
419      * <p>
420      * If there are fewer than <strong>three</strong> data pairs in the model,
421      * or if there is no variation in <code>x</code>, this returns
422      * <code>Double.NaN</code>.</p>
423      *
424      * @return sum of squared deviations of y values
425      */
getMeanSquareError()426     public double getMeanSquareError() {
427         if (n < 3) {
428             return Double.NaN;
429         }
430         return getSumSquaredErrors() / (n - 2);
431     }
432 
433     /**
434      * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
435      * Pearson's product moment correlation coefficient</a>,
436      * usually denoted r.
437      * <p>
438      * <strong>Preconditions</strong>: <ul>
439      * <li>At least two observations (with at least two different x values)
440      * must have been added before invoking this method. If this method is
441      * invoked before a model can be estimated, <code>Double,NaN</code> is
442      * returned.
443      * </li></ul></p>
444      *
445      * @return Pearson's r
446      */
getR()447     public double getR() {
448         double b1 = getSlope();
449         double result = FastMath.sqrt(getRSquare());
450         if (b1 < 0) {
451             result = -result;
452         }
453         return result;
454     }
455 
456     /**
457      * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
458      * coefficient of determination</a>,
459      * usually denoted r-square.
460      * <p>
461      * <strong>Preconditions</strong>: <ul>
462      * <li>At least two observations (with at least two different x values)
463      * must have been added before invoking this method. If this method is
464      * invoked before a model can be estimated, <code>Double,NaN</code> is
465      * returned.
466      * </li></ul></p>
467      *
468      * @return r-square
469      */
getRSquare()470     public double getRSquare() {
471         double ssto = getTotalSumSquares();
472         return (ssto - getSumSquaredErrors()) / ssto;
473     }
474 
475     /**
476      * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
477      * standard error of the intercept estimate</a>,
478      * usually denoted s(b0).
479      * <p>
480      * If there are fewer that <strong>three</strong> observations in the
481      * model, or if there is no variation in x, this returns
482      * <code>Double.NaN</code>.</p>
483      *
484      * @return standard error associated with intercept estimate
485      */
getInterceptStdErr()486     public double getInterceptStdErr() {
487         return FastMath.sqrt(
488             getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX));
489     }
490 
491     /**
492      * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
493      * error of the slope estimate</a>,
494      * usually denoted s(b1).
495      * <p>
496      * If there are fewer that <strong>three</strong> data pairs in the model,
497      * or if there is no variation in x, this returns <code>Double.NaN</code>.
498      * </p>
499      *
500      * @return standard error associated with slope estimate
501      */
getSlopeStdErr()502     public double getSlopeStdErr() {
503         return FastMath.sqrt(getMeanSquareError() / sumXX);
504     }
505 
506     /**
507      * Returns the half-width of a 95% confidence interval for the slope
508      * estimate.
509      * <p>
510      * The 95% confidence interval is</p>
511      * <p>
512      * <code>(getSlope() - getSlopeConfidenceInterval(),
513      * getSlope() + getSlopeConfidenceInterval())</code></p>
514      * <p>
515      * If there are fewer that <strong>three</strong> observations in the
516      * model, or if there is no variation in x, this returns
517      * <code>Double.NaN</code>.</p>
518      * <p>
519      * <strong>Usage Note</strong>:<br>
520      * The validity of this statistic depends on the assumption that the
521      * observations included in the model are drawn from a
522      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
523      * Bivariate Normal Distribution</a>.</p>
524      *
525      * @return half-width of 95% confidence interval for the slope estimate
526      * @throws MathException if the confidence interval can not be computed.
527      */
getSlopeConfidenceInterval()528     public double getSlopeConfidenceInterval() throws MathException {
529         return getSlopeConfidenceInterval(0.05d);
530     }
531 
532     /**
533      * Returns the half-width of a (100-100*alpha)% confidence interval for
534      * the slope estimate.
535      * <p>
536      * The (100-100*alpha)% confidence interval is </p>
537      * <p>
538      * <code>(getSlope() - getSlopeConfidenceInterval(),
539      * getSlope() + getSlopeConfidenceInterval())</code></p>
540      * <p>
541      * To request, for example, a 99% confidence interval, use
542      * <code>alpha = .01</code></p>
543      * <p>
544      * <strong>Usage Note</strong>:<br>
545      * The validity of this statistic depends on the assumption that the
546      * observations included in the model are drawn from a
547      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
548      * Bivariate Normal Distribution</a>.</p>
549      * <p>
550      * <strong> Preconditions:</strong><ul>
551      * <li>If there are fewer that <strong>three</strong> observations in the
552      * model, or if there is no variation in x, this returns
553      * <code>Double.NaN</code>.
554      * </li>
555      * <li><code>(0 < alpha < 1)</code>; otherwise an
556      * <code>IllegalArgumentException</code> is thrown.
557      * </li></ul></p>
558      *
559      * @param alpha the desired significance level
560      * @return half-width of 95% confidence interval for the slope estimate
561      * @throws MathException if the confidence interval can not be computed.
562      */
getSlopeConfidenceInterval(double alpha)563     public double getSlopeConfidenceInterval(double alpha)
564         throws MathException {
565         if (alpha >= 1 || alpha <= 0) {
566             throw MathRuntimeException.createIllegalArgumentException(
567                   LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
568                   alpha, 0.0, 1.0);
569         }
570         return getSlopeStdErr() *
571             distribution.inverseCumulativeProbability(1d - alpha / 2d);
572     }
573 
574     /**
575      * Returns the significance level of the slope (equiv) correlation.
576      * <p>
577      * Specifically, the returned value is the smallest <code>alpha</code>
578      * such that the slope confidence interval with significance level
579      * equal to <code>alpha</code> does not include <code>0</code>.
580      * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
581      * </p><p>
582      * <strong>Usage Note</strong>:<br>
583      * The validity of this statistic depends on the assumption that the
584      * observations included in the model are drawn from a
585      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
586      * Bivariate Normal Distribution</a>.</p>
587      * <p>
588      * If there are fewer that <strong>three</strong> observations in the
589      * model, or if there is no variation in x, this returns
590      * <code>Double.NaN</code>.</p>
591      *
592      * @return significance level for slope/correlation
593      * @throws MathException if the significance level can not be computed.
594      */
getSignificance()595     public double getSignificance() throws MathException {
596         return 2d * (1.0 - distribution.cumulativeProbability(
597                     FastMath.abs(getSlope()) / getSlopeStdErr()));
598     }
599 
600     // ---------------------Private methods-----------------------------------
601 
602     /**
603     * Returns the intercept of the estimated regression line, given the slope.
604     * <p>
605     * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
606     *
607     * @param slope current slope
608     * @return the intercept of the regression line
609     */
getIntercept(double slope)610     private double getIntercept(double slope) {
611         return (sumY - slope * sumX) / n;
612     }
613 
614     /**
615      * Computes SSR from b1.
616      *
617      * @param slope regression slope estimate
618      * @return sum of squared deviations of predicted y values
619      */
getRegressionSumSquares(double slope)620     private double getRegressionSumSquares(double slope) {
621         return slope * slope * sumXX;
622     }
623 
624     /**
625      * Modify the distribution used to compute inference statistics.
626      * @param value the new distribution
627      * @since 1.2
628      * @deprecated in 2.2 (to be removed in 3.0).
629      */
630     @Deprecated
setDistribution(TDistribution value)631     public void setDistribution(TDistribution value) {
632         distribution = value;
633 
634         // modify degrees of freedom
635         if (n > 2) {
636             distribution.setDegreesOfFreedom(n - 2);
637         }
638     }
639 }
640