001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 package org.apache.commons.math3.stat.regression;
019 import java.io.Serializable;
020
021 import org.apache.commons.math3.exception.OutOfRangeException;
022 import org.apache.commons.math3.distribution.TDistribution;
023 import org.apache.commons.math3.exception.MathIllegalArgumentException;
024 import org.apache.commons.math3.exception.NoDataException;
025 import org.apache.commons.math3.exception.util.LocalizedFormats;
026 import org.apache.commons.math3.util.FastMath;
027 import org.apache.commons.math3.util.Precision;
028
029 /**
030 * Estimates an ordinary least squares regression model
031 * with one independent variable.
032 * <p>
033 * <code> y = intercept + slope * x </code></p>
034 * <p>
035 * Standard errors for <code>intercept</code> and <code>slope</code> are
036 * available as well as ANOVA, r-square and Pearson's r statistics.</p>
037 * <p>
038 * Observations (x,y pairs) can be added to the model one at a time or they
039 * can be provided in a 2-dimensional array. The observations are not stored
040 * in memory, so there is no limit to the number of observations that can be
041 * added to the model.</p>
042 * <p>
043 * <strong>Usage Notes</strong>: <ul>
044 * <li> When there are fewer than two observations in the model, or when
045 * there is no variation in the x values (i.e. all x values are the same)
046 * all statistics return <code>NaN</code>. At least two observations with
047 * different x coordinates are required to estimate a bivariate regression
048 * model.
049 * </li>
050 * <li> Getters for the statistics always compute values based on the current
051 * set of observations -- i.e., you can get statistics, then add more data
052 * and get updated statistics without using a new instance. There is no
053 * "compute" method that updates all statistics. Each of the getters performs
054 * the necessary computations to return the requested statistic.
055 * </li>
056 * <li> The intercept term may be suppressed by passing {@code false} to
057 * the {@link #SimpleRegression(boolean)} constructor. When the
058 * {@code hasIntercept} property is false, the model is estimated without a
059 * constant term and {@link #getIntercept()} returns {@code 0}.</li>
060 * </ul></p>
061 *
062 * @version $Id: SimpleRegression.java 1416643 2012-12-03 19:37:14Z tn $
063 */
064 public class SimpleRegression implements Serializable, UpdatingMultipleLinearRegression {
065
066 /** Serializable version identifier */
067 private static final long serialVersionUID = -3004689053607543335L;
068
069 /** sum of x values */
070 private double sumX = 0d;
071
072 /** total variation in x (sum of squared deviations from xbar) */
073 private double sumXX = 0d;
074
075 /** sum of y values */
076 private double sumY = 0d;
077
078 /** total variation in y (sum of squared deviations from ybar) */
079 private double sumYY = 0d;
080
081 /** sum of products */
082 private double sumXY = 0d;
083
084 /** number of observations */
085 private long n = 0;
086
087 /** mean of accumulated x values, used in updating formulas */
088 private double xbar = 0;
089
090 /** mean of accumulated y values, used in updating formulas */
091 private double ybar = 0;
092
093 /** include an intercept or not */
094 private final boolean hasIntercept;
095 // ---------------------Public methods--------------------------------------
096
097 /**
098 * Create an empty SimpleRegression instance
099 */
100 public SimpleRegression() {
101 this(true);
102 }
103 /**
104 * Create a SimpleRegression instance, specifying whether or not to estimate
105 * an intercept.
106 *
107 * <p>Use {@code false} to estimate a model with no intercept. When the
108 * {@code hasIntercept} property is false, the model is estimated without a
109 * constant term and {@link #getIntercept()} returns {@code 0}.</p>
110 *
111 * @param includeIntercept whether or not to include an intercept term in
112 * the regression model
113 */
114 public SimpleRegression(boolean includeIntercept) {
115 super();
116 hasIntercept = includeIntercept;
117 }
118
119 /**
120 * Adds the observation (x,y) to the regression data set.
121 * <p>
122 * Uses updating formulas for means and sums of squares defined in
123 * "Algorithms for Computing the Sample Variance: Analysis and
124 * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
125 * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
126 * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
127 *
128 *
129 * @param x independent variable value
130 * @param y dependent variable value
131 */
132 public void addData(final double x,final double y) {
133 if (n == 0) {
134 xbar = x;
135 ybar = y;
136 } else {
137 if( hasIntercept ){
138 final double fact1 = 1.0 + n;
139 final double fact2 = n / (1.0 + n);
140 final double dx = x - xbar;
141 final double dy = y - ybar;
142 sumXX += dx * dx * fact2;
143 sumYY += dy * dy * fact2;
144 sumXY += dx * dy * fact2;
145 xbar += dx / fact1;
146 ybar += dy / fact1;
147 }
148 }
149 if( !hasIntercept ){
150 sumXX += x * x ;
151 sumYY += y * y ;
152 sumXY += x * y ;
153 }
154 sumX += x;
155 sumY += y;
156 n++;
157 }
158
159
160 /**
161 * Removes the observation (x,y) from the regression data set.
162 * <p>
163 * Mirrors the addData method. This method permits the use of
164 * SimpleRegression instances in streaming mode where the regression
165 * is applied to a sliding "window" of observations, however the caller is
166 * responsible for maintaining the set of observations in the window.</p>
167 *
168 * The method has no effect if there are no points of data (i.e. n=0)
169 *
170 * @param x independent variable value
171 * @param y dependent variable value
172 */
173 public void removeData(final double x,final double y) {
174 if (n > 0) {
175 if (hasIntercept) {
176 final double fact1 = n - 1.0;
177 final double fact2 = n / (n - 1.0);
178 final double dx = x - xbar;
179 final double dy = y - ybar;
180 sumXX -= dx * dx * fact2;
181 sumYY -= dy * dy * fact2;
182 sumXY -= dx * dy * fact2;
183 xbar -= dx / fact1;
184 ybar -= dy / fact1;
185 } else {
186 final double fact1 = n - 1.0;
187 sumXX -= x * x;
188 sumYY -= y * y;
189 sumXY -= x * y;
190 xbar -= x / fact1;
191 ybar -= y / fact1;
192 }
193 sumX -= x;
194 sumY -= y;
195 n--;
196 }
197 }
198
199 /**
200 * Adds the observations represented by the elements in
201 * <code>data</code>.
202 * <p>
203 * <code>(data[0][0],data[0][1])</code> will be the first observation, then
204 * <code>(data[1][0],data[1][1])</code>, etc.</p>
205 * <p>
206 * This method does not replace data that has already been added. The
207 * observations represented by <code>data</code> are added to the existing
208 * dataset.</p>
209 * <p>
210 * To replace all data, use <code>clear()</code> before adding the new
211 * data.</p>
212 *
213 * @param data array of observations to be added
214 * @throws ModelSpecificationException if the length of {@code data[i]} is not
215 * greater than or equal to 2
216 */
217 public void addData(final double[][] data) throws ModelSpecificationException {
218 for (int i = 0; i < data.length; i++) {
219 if( data[i].length < 2 ){
220 throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,
221 data[i].length, 2);
222 }
223 addData(data[i][0], data[i][1]);
224 }
225 }
226
227 /**
228 * Adds one observation to the regression model.
229 *
230 * @param x the independent variables which form the design matrix
231 * @param y the dependent or response variable
232 * @throws ModelSpecificationException if the length of {@code x} does not equal
233 * the number of independent variables in the model
234 */
235 public void addObservation(final double[] x,final double y)
236 throws ModelSpecificationException {
237 if( x == null || x.length == 0 ){
238 throw new ModelSpecificationException(LocalizedFormats.INVALID_REGRESSION_OBSERVATION,x!=null?x.length:0, 1);
239 }
240 addData( x[0], y );
241 }
242
243 /**
244 * Adds a series of observations to the regression model. The lengths of
245 * x and y must be the same and x must be rectangular.
246 *
247 * @param x a series of observations on the independent variables
248 * @param y a series of observations on the dependent variable
249 * The length of x and y must be the same
250 * @throws ModelSpecificationException if {@code x} is not rectangular, does not match
251 * the length of {@code y} or does not contain sufficient data to estimate the model
252 */
253 public void addObservations(final double[][] x,final double[] y) throws ModelSpecificationException {
254 if ((x == null) || (y == null) || (x.length != y.length)) {
255 throw new ModelSpecificationException(
256 LocalizedFormats.DIMENSIONS_MISMATCH_SIMPLE,
257 (x == null) ? 0 : x.length,
258 (y == null) ? 0 : y.length);
259 }
260 boolean obsOk=true;
261 for( int i = 0 ; i < x.length; i++){
262 if( x[i] == null || x[i].length == 0 ){
263 obsOk = false;
264 }
265 }
266 if( !obsOk ){
267 throw new ModelSpecificationException(
268 LocalizedFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS,
269 0, 1);
270 }
271 for( int i = 0 ; i < x.length ; i++){
272 addData( x[i][0], y[i] );
273 }
274 }
275
276 /**
277 * Removes observations represented by the elements in <code>data</code>.
278 * <p>
279 * If the array is larger than the current n, only the first n elements are
280 * processed. This method permits the use of SimpleRegression instances in
281 * streaming mode where the regression is applied to a sliding "window" of
282 * observations, however the caller is responsible for maintaining the set
283 * of observations in the window.</p>
284 * <p>
285 * To remove all data, use <code>clear()</code>.</p>
286 *
287 * @param data array of observations to be removed
288 */
289 public void removeData(double[][] data) {
290 for (int i = 0; i < data.length && n > 0; i++) {
291 removeData(data[i][0], data[i][1]);
292 }
293 }
294
295 /**
296 * Clears all data from the model.
297 */
298 public void clear() {
299 sumX = 0d;
300 sumXX = 0d;
301 sumY = 0d;
302 sumYY = 0d;
303 sumXY = 0d;
304 n = 0;
305 }
306
307 /**
308 * Returns the number of observations that have been added to the model.
309 *
310 * @return n number of observations that have been added.
311 */
312 public long getN() {
313 return n;
314 }
315
316 /**
317 * Returns the "predicted" <code>y</code> value associated with the
318 * supplied <code>x</code> value, based on the data that has been
319 * added to the model when this method is activated.
320 * <p>
321 * <code> predict(x) = intercept + slope * x </code></p>
322 * <p>
323 * <strong>Preconditions</strong>: <ul>
324 * <li>At least two observations (with at least two different x values)
325 * must have been added before invoking this method. If this method is
326 * invoked before a model can be estimated, <code>Double,NaN</code> is
327 * returned.
328 * </li></ul></p>
329 *
330 * @param x input <code>x</code> value
331 * @return predicted <code>y</code> value
332 */
333 public double predict(final double x) {
334 final double b1 = getSlope();
335 if (hasIntercept) {
336 return getIntercept(b1) + b1 * x;
337 }
338 return b1 * x;
339 }
340
341 /**
342 * Returns the intercept of the estimated regression line, if
343 * {@link #hasIntercept()} is true; otherwise 0.
344 * <p>
345 * The least squares estimate of the intercept is computed using the
346 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
347 * The intercept is sometimes denoted b0.</p>
348 * <p>
349 * <strong>Preconditions</strong>: <ul>
350 * <li>At least two observations (with at least two different x values)
351 * must have been added before invoking this method. If this method is
352 * invoked before a model can be estimated, <code>Double,NaN</code> is
353 * returned.
354 * </li></ul></p>
355 *
356 * @return the intercept of the regression line if the model includes an
357 * intercept; 0 otherwise
358 * @see #SimpleRegression(boolean)
359 */
360 public double getIntercept() {
361 return hasIntercept ? getIntercept(getSlope()) : 0.0;
362 }
363
364 /**
365 * Returns true if the model includes an intercept term.
366 *
367 * @return true if the regression includes an intercept; false otherwise
368 * @see #SimpleRegression(boolean)
369 */
370 public boolean hasIntercept() {
371 return hasIntercept;
372 }
373
374 /**
375 * Returns the slope of the estimated regression line.
376 * <p>
377 * The least squares estimate of the slope is computed using the
378 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
379 * The slope is sometimes denoted b1.</p>
380 * <p>
381 * <strong>Preconditions</strong>: <ul>
382 * <li>At least two observations (with at least two different x values)
383 * must have been added before invoking this method. If this method is
384 * invoked before a model can be estimated, <code>Double.NaN</code> is
385 * returned.
386 * </li></ul></p>
387 *
388 * @return the slope of the regression line
389 */
390 public double getSlope() {
391 if (n < 2) {
392 return Double.NaN; //not enough data
393 }
394 if (FastMath.abs(sumXX) < 10 * Double.MIN_VALUE) {
395 return Double.NaN; //not enough variation in x
396 }
397 return sumXY / sumXX;
398 }
399
400 /**
401 * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
402 * sum of squared errors</a> (SSE) associated with the regression
403 * model.
404 * <p>
405 * The sum is computed using the computational formula</p>
406 * <p>
407 * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
408 * <p>
409 * where <code>SYY</code> is the sum of the squared deviations of the y
410 * values about their mean, <code>SXX</code> is similarly defined and
411 * <code>SXY</code> is the sum of the products of x and y mean deviations.
412 * </p><p>
413 * The sums are accumulated using the updating algorithm referenced in
414 * {@link #addData}.</p>
415 * <p>
416 * The return value is constrained to be non-negative - i.e., if due to
417 * rounding errors the computational formula returns a negative result,
418 * 0 is returned.</p>
419 * <p>
420 * <strong>Preconditions</strong>: <ul>
421 * <li>At least two observations (with at least two different x values)
422 * must have been added before invoking this method. If this method is
423 * invoked before a model can be estimated, <code>Double,NaN</code> is
424 * returned.
425 * </li></ul></p>
426 *
427 * @return sum of squared errors associated with the regression model
428 */
429 public double getSumSquaredErrors() {
430 return FastMath.max(0d, sumYY - sumXY * sumXY / sumXX);
431 }
432
433 /**
434 * Returns the sum of squared deviations of the y values about their mean.
435 * <p>
436 * This is defined as SSTO
437 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
438 * <p>
439 * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p>
440 *
441 * @return sum of squared deviations of y values
442 */
443 public double getTotalSumSquares() {
444 if (n < 2) {
445 return Double.NaN;
446 }
447 return sumYY;
448 }
449
450 /**
451 * Returns the sum of squared deviations of the x values about their mean.
452 *
453 * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p>
454 *
455 * @return sum of squared deviations of x values
456 */
457 public double getXSumSquares() {
458 if (n < 2) {
459 return Double.NaN;
460 }
461 return sumXX;
462 }
463
464 /**
465 * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>.
466 *
467 * @return sum of cross products
468 */
469 public double getSumOfCrossProducts() {
470 return sumXY;
471 }
472
473 /**
474 * Returns the sum of squared deviations of the predicted y values about
475 * their mean (which equals the mean of y).
476 * <p>
477 * This is usually abbreviated SSR or SSM. It is defined as SSM
478 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
479 * <p>
480 * <strong>Preconditions</strong>: <ul>
481 * <li>At least two observations (with at least two different x values)
482 * must have been added before invoking this method. If this method is
483 * invoked before a model can be estimated, <code>Double.NaN</code> is
484 * returned.
485 * </li></ul></p>
486 *
487 * @return sum of squared deviations of predicted y values
488 */
489 public double getRegressionSumSquares() {
490 return getRegressionSumSquares(getSlope());
491 }
492
493 /**
494 * Returns the sum of squared errors divided by the degrees of freedom,
495 * usually abbreviated MSE.
496 * <p>
497 * If there are fewer than <strong>three</strong> data pairs in the model,
498 * or if there is no variation in <code>x</code>, this returns
499 * <code>Double.NaN</code>.</p>
500 *
501 * @return sum of squared deviations of y values
502 */
503 public double getMeanSquareError() {
504 if (n < 3) {
505 return Double.NaN;
506 }
507 return hasIntercept ? (getSumSquaredErrors() / (n - 2)) : (getSumSquaredErrors() / (n - 1));
508 }
509
510 /**
511 * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
512 * Pearson's product moment correlation coefficient</a>,
513 * usually denoted r.
514 * <p>
515 * <strong>Preconditions</strong>: <ul>
516 * <li>At least two observations (with at least two different x values)
517 * must have been added before invoking this method. If this method is
518 * invoked before a model can be estimated, <code>Double,NaN</code> is
519 * returned.
520 * </li></ul></p>
521 *
522 * @return Pearson's r
523 */
524 public double getR() {
525 double b1 = getSlope();
526 double result = FastMath.sqrt(getRSquare());
527 if (b1 < 0) {
528 result = -result;
529 }
530 return result;
531 }
532
533 /**
534 * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
535 * coefficient of determination</a>,
536 * usually denoted r-square.
537 * <p>
538 * <strong>Preconditions</strong>: <ul>
539 * <li>At least two observations (with at least two different x values)
540 * must have been added before invoking this method. If this method is
541 * invoked before a model can be estimated, <code>Double,NaN</code> is
542 * returned.
543 * </li></ul></p>
544 *
545 * @return r-square
546 */
547 public double getRSquare() {
548 double ssto = getTotalSumSquares();
549 return (ssto - getSumSquaredErrors()) / ssto;
550 }
551
552 /**
553 * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
554 * standard error of the intercept estimate</a>,
555 * usually denoted s(b0).
556 * <p>
557 * If there are fewer that <strong>three</strong> observations in the
558 * model, or if there is no variation in x, this returns
559 * <code>Double.NaN</code>.</p> Additionally, a <code>Double.NaN</code> is
560 * returned when the intercept is constrained to be zero
561 *
562 * @return standard error associated with intercept estimate
563 */
564 public double getInterceptStdErr() {
565 if( !hasIntercept ){
566 return Double.NaN;
567 }
568 return FastMath.sqrt(
569 getMeanSquareError() * ((1d / n) + (xbar * xbar) / sumXX));
570 }
571
572 /**
573 * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
574 * error of the slope estimate</a>,
575 * usually denoted s(b1).
576 * <p>
577 * If there are fewer that <strong>three</strong> data pairs in the model,
578 * or if there is no variation in x, this returns <code>Double.NaN</code>.
579 * </p>
580 *
581 * @return standard error associated with slope estimate
582 */
583 public double getSlopeStdErr() {
584 return FastMath.sqrt(getMeanSquareError() / sumXX);
585 }
586
587 /**
588 * Returns the half-width of a 95% confidence interval for the slope
589 * estimate.
590 * <p>
591 * The 95% confidence interval is</p>
592 * <p>
593 * <code>(getSlope() - getSlopeConfidenceInterval(),
594 * getSlope() + getSlopeConfidenceInterval())</code></p>
595 * <p>
596 * If there are fewer that <strong>three</strong> observations in the
597 * model, or if there is no variation in x, this returns
598 * <code>Double.NaN</code>.</p>
599 * <p>
600 * <strong>Usage Note</strong>:<br>
601 * The validity of this statistic depends on the assumption that the
602 * observations included in the model are drawn from a
603 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
604 * Bivariate Normal Distribution</a>.</p>
605 *
606 * @return half-width of 95% confidence interval for the slope estimate
607 * @throws OutOfRangeException if the confidence interval can not be computed.
608 */
609 public double getSlopeConfidenceInterval() throws OutOfRangeException {
610 return getSlopeConfidenceInterval(0.05d);
611 }
612
613 /**
614 * Returns the half-width of a (100-100*alpha)% confidence interval for
615 * the slope estimate.
616 * <p>
617 * The (100-100*alpha)% confidence interval is </p>
618 * <p>
619 * <code>(getSlope() - getSlopeConfidenceInterval(),
620 * getSlope() + getSlopeConfidenceInterval())</code></p>
621 * <p>
622 * To request, for example, a 99% confidence interval, use
623 * <code>alpha = .01</code></p>
624 * <p>
625 * <strong>Usage Note</strong>:<br>
626 * The validity of this statistic depends on the assumption that the
627 * observations included in the model are drawn from a
628 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
629 * Bivariate Normal Distribution</a>.</p>
630 * <p>
631 * <strong> Preconditions:</strong><ul>
632 * <li>If there are fewer that <strong>three</strong> observations in the
633 * model, or if there is no variation in x, this returns
634 * <code>Double.NaN</code>.
635 * </li>
636 * <li><code>(0 < alpha < 1)</code>; otherwise an
637 * <code>OutOfRangeException</code> is thrown.
638 * </li></ul></p>
639 *
640 * @param alpha the desired significance level
641 * @return half-width of 95% confidence interval for the slope estimate
642 * @throws OutOfRangeException if the confidence interval can not be computed.
643 */
644 public double getSlopeConfidenceInterval(final double alpha)
645 throws OutOfRangeException {
646 if (n < 3) {
647 return Double.NaN;
648 }
649 if (alpha >= 1 || alpha <= 0) {
650 throw new OutOfRangeException(LocalizedFormats.SIGNIFICANCE_LEVEL,
651 alpha, 0, 1);
652 }
653 // No advertised NotStrictlyPositiveException here - will return NaN above
654 TDistribution distribution = new TDistribution(n - 2);
655 return getSlopeStdErr() *
656 distribution.inverseCumulativeProbability(1d - alpha / 2d);
657 }
658
659 /**
660 * Returns the significance level of the slope (equiv) correlation.
661 * <p>
662 * Specifically, the returned value is the smallest <code>alpha</code>
663 * such that the slope confidence interval with significance level
664 * equal to <code>alpha</code> does not include <code>0</code>.
665 * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
666 * </p><p>
667 * <strong>Usage Note</strong>:<br>
668 * The validity of this statistic depends on the assumption that the
669 * observations included in the model are drawn from a
670 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
671 * Bivariate Normal Distribution</a>.</p>
672 * <p>
673 * If there are fewer that <strong>three</strong> observations in the
674 * model, or if there is no variation in x, this returns
675 * <code>Double.NaN</code>.</p>
676 *
677 * @return significance level for slope/correlation
678 * @throws org.apache.commons.math3.exception.MaxCountExceededException
679 * if the significance level can not be computed.
680 */
681 public double getSignificance() {
682 if (n < 3) {
683 return Double.NaN;
684 }
685 // No advertised NotStrictlyPositiveException here - will return NaN above
686 TDistribution distribution = new TDistribution(n - 2);
687 return 2d * (1.0 - distribution.cumulativeProbability(
688 FastMath.abs(getSlope()) / getSlopeStdErr()));
689 }
690
691 // ---------------------Private methods-----------------------------------
692
693 /**
694 * Returns the intercept of the estimated regression line, given the slope.
695 * <p>
696 * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
697 *
698 * @param slope current slope
699 * @return the intercept of the regression line
700 */
701 private double getIntercept(final double slope) {
702 if( hasIntercept){
703 return (sumY - slope * sumX) / n;
704 }
705 return 0.0;
706 }
707
708 /**
709 * Computes SSR from b1.
710 *
711 * @param slope regression slope estimate
712 * @return sum of squared deviations of predicted y values
713 */
714 private double getRegressionSumSquares(final double slope) {
715 return slope * slope * sumXX;
716 }
717
718 /**
719 * Performs a regression on data present in buffers and outputs a RegressionResults object.
720 *
721 * <p>If there are fewer than 3 observations in the model and {@code hasIntercept} is true
722 * a {@code NoDataException} is thrown. If there is no intercept term, the model must
723 * contain at least 2 observations.</p>
724 *
725 * @return RegressionResults acts as a container of regression output
726 * @throws ModelSpecificationException if the model is not correctly specified
727 * @throws NoDataException if there is not sufficient data in the model to
728 * estimate the regression parameters
729 */
730 public RegressionResults regress() throws ModelSpecificationException, NoDataException {
731 if (hasIntercept) {
732 if( n < 3 ){
733 throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
734 }
735 if( FastMath.abs( sumXX ) > Precision.SAFE_MIN ){
736 final double[] params = new double[]{ getIntercept(), getSlope() };
737 final double mse = getMeanSquareError();
738 final double _syy = sumYY + sumY * sumY / n;
739 final double[] vcv = new double[]{
740 mse * (xbar *xbar /sumXX + 1.0 / n),
741 -xbar*mse/sumXX,
742 mse/sumXX };
743 return new RegressionResults(
744 params, new double[][]{vcv}, true, n, 2,
745 sumY, _syy, getSumSquaredErrors(),true,false);
746 }else{
747 final double[] params = new double[]{ sumY / n, Double.NaN };
748 //final double mse = getMeanSquareError();
749 final double[] vcv = new double[]{
750 ybar / (n - 1.0),
751 Double.NaN,
752 Double.NaN };
753 return new RegressionResults(
754 params, new double[][]{vcv}, true, n, 1,
755 sumY, sumYY, getSumSquaredErrors(),true,false);
756 }
757 }else{
758 if (n < 2) {
759 throw new NoDataException(LocalizedFormats.NOT_ENOUGH_DATA_REGRESSION);
760 }
761 if( !Double.isNaN(sumXX) ){
762 final double[] vcv = new double[]{ getMeanSquareError() / sumXX };
763 final double[] params = new double[]{ sumXY/sumXX };
764 return new RegressionResults(
765 params, new double[][]{vcv}, true, n, 1,
766 sumY, sumYY, getSumSquaredErrors(),false,false);
767 }else{
768 final double[] vcv = new double[]{Double.NaN };
769 final double[] params = new double[]{ Double.NaN };
770 return new RegressionResults(
771 params, new double[][]{vcv}, true, n, 1,
772 Double.NaN, Double.NaN, Double.NaN,false,false);
773 }
774 }
775 }
776
777 /**
778 * Performs a regression on data present in buffers including only regressors
779 * indexed in variablesToInclude and outputs a RegressionResults object
780 * @param variablesToInclude an array of indices of regressors to include
781 * @return RegressionResults acts as a container of regression output
782 * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length
783 * @throws OutOfRangeException if a requested variable is not present in model
784 */
785 public RegressionResults regress(int[] variablesToInclude) throws MathIllegalArgumentException{
786 if( variablesToInclude == null || variablesToInclude.length == 0){
787 throw new MathIllegalArgumentException(LocalizedFormats.ARRAY_ZERO_LENGTH_OR_NULL_NOT_ALLOWED);
788 }
789 if( variablesToInclude.length > 2 || (variablesToInclude.length > 1 && !hasIntercept) ){
790 throw new ModelSpecificationException(
791 LocalizedFormats.ARRAY_SIZE_EXCEEDS_MAX_VARIABLES,
792 (variablesToInclude.length > 1 && !hasIntercept) ? 1 : 2);
793 }
794
795 if( hasIntercept ){
796 if( variablesToInclude.length == 2 ){
797 if( variablesToInclude[0] == 1 ){
798 throw new ModelSpecificationException(LocalizedFormats.NOT_INCREASING_SEQUENCE);
799 }else if( variablesToInclude[0] != 0 ){
800 throw new OutOfRangeException( variablesToInclude[0], 0,1 );
801 }
802 if( variablesToInclude[1] != 1){
803 throw new OutOfRangeException( variablesToInclude[0], 0,1 );
804 }
805 return regress();
806 }else{
807 if( variablesToInclude[0] != 1 && variablesToInclude[0] != 0 ){
808 throw new OutOfRangeException( variablesToInclude[0],0,1 );
809 }
810 final double _mean = sumY * sumY / n;
811 final double _syy = sumYY + _mean;
812 if( variablesToInclude[0] == 0 ){
813 //just the mean
814 final double[] vcv = new double[]{ sumYY/(((n-1)*n)) };
815 final double[] params = new double[]{ ybar };
816 return new RegressionResults(
817 params, new double[][]{vcv}, true, n, 1,
818 sumY, _syy+_mean, sumYY,true,false);
819
820 }else if( variablesToInclude[0] == 1){
821 //final double _syy = sumYY + sumY * sumY / ((double) n);
822 final double _sxx = sumXX + sumX * sumX / n;
823 final double _sxy = sumXY + sumX * sumY / n;
824 final double _sse = FastMath.max(0d, _syy - _sxy * _sxy / _sxx);
825 final double _mse = _sse/((n-1));
826 if( !Double.isNaN(_sxx) ){
827 final double[] vcv = new double[]{ _mse / _sxx };
828 final double[] params = new double[]{ _sxy/_sxx };
829 return new RegressionResults(
830 params, new double[][]{vcv}, true, n, 1,
831 sumY, _syy, _sse,false,false);
832 }else{
833 final double[] vcv = new double[]{Double.NaN };
834 final double[] params = new double[]{ Double.NaN };
835 return new RegressionResults(
836 params, new double[][]{vcv}, true, n, 1,
837 Double.NaN, Double.NaN, Double.NaN,false,false);
838 }
839 }
840 }
841 }else{
842 if( variablesToInclude[0] != 0 ){
843 throw new OutOfRangeException(variablesToInclude[0],0,0);
844 }
845 return regress();
846 }
847
848 return null;
849 }
850 }