001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.math3.stat.descriptive.moment;
018
019 import java.io.Serializable;
020
021 import org.apache.commons.math3.exception.MathIllegalArgumentException;
022 import org.apache.commons.math3.exception.NullArgumentException;
023 import org.apache.commons.math3.exception.util.LocalizedFormats;
024 import org.apache.commons.math3.stat.descriptive.WeightedEvaluation;
025 import org.apache.commons.math3.stat.descriptive.AbstractStorelessUnivariateStatistic;
026 import org.apache.commons.math3.util.MathUtils;
027
028 /**
029 * Computes the variance of the available values. By default, the unbiased
030 * "sample variance" definitional formula is used:
031 * <p>
032 * variance = sum((x_i - mean)^2) / (n - 1) </p>
033 * <p>
034 * where mean is the {@link Mean} and <code>n</code> is the number
035 * of sample observations.</p>
036 * <p>
037 * The definitional formula does not have good numerical properties, so
038 * this implementation does not compute the statistic using the definitional
039 * formula. <ul>
040 * <li> The <code>getResult</code> method computes the variance using
041 * updating formulas based on West's algorithm, as described in
042 * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and
043 * J. G. Lewis 1979, <i>Communications of the ACM</i>,
044 * vol. 22 no. 9, pp. 526-531.</a></li>
045 * <li> The <code>evaluate</code> methods leverage the fact that they have the
046 * full array of values in memory to execute a two-pass algorithm.
047 * Specifically, these methods use the "corrected two-pass algorithm" from
048 * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>,
049 * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li></ul>
050 * Note that adding values using <code>increment</code> or
051 * <code>incrementAll</code> and then executing <code>getResult</code> will
052 * sometimes give a different, less accurate, result than executing
053 * <code>evaluate</code> with the full array of values. The former approach
054 * should only be used when the full array of values is not available.</p>
055 * <p>
056 * The "population variance" ( sum((x_i - mean)^2) / n ) can also
057 * be computed using this statistic. The <code>isBiasCorrected</code>
058 * property determines whether the "population" or "sample" value is
059 * returned by the <code>evaluate</code> and <code>getResult</code> methods.
060 * To compute population variances, set this property to <code>false.</code>
061 * </p>
062 * <p>
063 * <strong>Note that this implementation is not synchronized.</strong> If
064 * multiple threads access an instance of this class concurrently, and at least
065 * one of the threads invokes the <code>increment()</code> or
066 * <code>clear()</code> method, it must be synchronized externally.</p>
067 *
068 * @version $Id: Variance.java 1416643 2012-12-03 19:37:14Z tn $
069 */
070 public class Variance extends AbstractStorelessUnivariateStatistic implements Serializable, WeightedEvaluation {
071
072 /** Serializable version identifier */
073 private static final long serialVersionUID = -9111962718267217978L;
074
075 /** SecondMoment is used in incremental calculation of Variance*/
076 protected SecondMoment moment = null;
077
078 /**
079 * Whether or not {@link #increment(double)} should increment
080 * the internal second moment. When a Variance is constructed with an
081 * external SecondMoment as a constructor parameter, this property is
082 * set to false and increments must be applied to the second moment
083 * directly.
084 */
085 protected boolean incMoment = true;
086
087 /**
088 * Whether or not bias correction is applied when computing the
089 * value of the statistic. True means that bias is corrected. See
090 * {@link Variance} for details on the formula.
091 */
092 private boolean isBiasCorrected = true;
093
094 /**
095 * Constructs a Variance with default (true) <code>isBiasCorrected</code>
096 * property.
097 */
098 public Variance() {
099 moment = new SecondMoment();
100 }
101
102 /**
103 * Constructs a Variance based on an external second moment.
104 * When this constructor is used, the statistic may only be
105 * incremented via the moment, i.e., {@link #increment(double)}
106 * does nothing; whereas {@code m2.increment(value)} increments
107 * both {@code m2} and the Variance instance constructed from it.
108 *
109 * @param m2 the SecondMoment (Third or Fourth moments work
110 * here as well.)
111 */
112 public Variance(final SecondMoment m2) {
113 incMoment = false;
114 this.moment = m2;
115 }
116
117 /**
118 * Constructs a Variance with the specified <code>isBiasCorrected</code>
119 * property
120 *
121 * @param isBiasCorrected setting for bias correction - true means
122 * bias will be corrected and is equivalent to using the argumentless
123 * constructor
124 */
125 public Variance(boolean isBiasCorrected) {
126 moment = new SecondMoment();
127 this.isBiasCorrected = isBiasCorrected;
128 }
129
130 /**
131 * Constructs a Variance with the specified <code>isBiasCorrected</code>
132 * property and the supplied external second moment.
133 *
134 * @param isBiasCorrected setting for bias correction - true means
135 * bias will be corrected
136 * @param m2 the SecondMoment (Third or Fourth moments work
137 * here as well.)
138 */
139 public Variance(boolean isBiasCorrected, SecondMoment m2) {
140 incMoment = false;
141 this.moment = m2;
142 this.isBiasCorrected = isBiasCorrected;
143 }
144
145 /**
146 * Copy constructor, creates a new {@code Variance} identical
147 * to the {@code original}
148 *
149 * @param original the {@code Variance} instance to copy
150 * @throws NullArgumentException if original is null
151 */
152 public Variance(Variance original) throws NullArgumentException {
153 copy(original, this);
154 }
155
156 /**
157 * {@inheritDoc}
158 * <p>If all values are available, it is more accurate to use
159 * {@link #evaluate(double[])} rather than adding values one at a time
160 * using this method and then executing {@link #getResult}, since
161 * <code>evaluate</code> leverages the fact that is has the full
162 * list of values together to execute a two-pass algorithm.
163 * See {@link Variance}.</p>
164 *
165 * <p>Note also that when {@link #Variance(SecondMoment)} is used to
166 * create a Variance, this method does nothing. In that case, the
167 * SecondMoment should be incremented directly.</p>
168 */
169 @Override
170 public void increment(final double d) {
171 if (incMoment) {
172 moment.increment(d);
173 }
174 }
175
176 /**
177 * {@inheritDoc}
178 */
179 @Override
180 public double getResult() {
181 if (moment.n == 0) {
182 return Double.NaN;
183 } else if (moment.n == 1) {
184 return 0d;
185 } else {
186 if (isBiasCorrected) {
187 return moment.m2 / (moment.n - 1d);
188 } else {
189 return moment.m2 / (moment.n);
190 }
191 }
192 }
193
194 /**
195 * {@inheritDoc}
196 */
197 public long getN() {
198 return moment.getN();
199 }
200
201 /**
202 * {@inheritDoc}
203 */
204 @Override
205 public void clear() {
206 if (incMoment) {
207 moment.clear();
208 }
209 }
210
211 /**
212 * Returns the variance of the entries in the input array, or
213 * <code>Double.NaN</code> if the array is empty.
214 * <p>
215 * See {@link Variance} for details on the computing algorithm.</p>
216 * <p>
217 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
218 * <p>
219 * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
220 * <p>
221 * Does not change the internal state of the statistic.</p>
222 *
223 * @param values the input array
224 * @return the variance of the values or Double.NaN if length = 0
225 * @throws MathIllegalArgumentException if the array is null
226 */
227 @Override
228 public double evaluate(final double[] values) throws MathIllegalArgumentException {
229 if (values == null) {
230 throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY);
231 }
232 return evaluate(values, 0, values.length);
233 }
234
235 /**
236 * Returns the variance of the entries in the specified portion of
237 * the input array, or <code>Double.NaN</code> if the designated subarray
238 * is empty.
239 * <p>
240 * See {@link Variance} for details on the computing algorithm.</p>
241 * <p>
242 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
243 * <p>
244 * Does not change the internal state of the statistic.</p>
245 * <p>
246 * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
247 *
248 * @param values the input array
249 * @param begin index of the first array element to include
250 * @param length the number of elements to include
251 * @return the variance of the values or Double.NaN if length = 0
252 * @throws MathIllegalArgumentException if the array is null or the array index
253 * parameters are not valid
254 */
255 @Override
256 public double evaluate(final double[] values, final int begin, final int length)
257 throws MathIllegalArgumentException {
258
259 double var = Double.NaN;
260
261 if (test(values, begin, length)) {
262 clear();
263 if (length == 1) {
264 var = 0.0;
265 } else if (length > 1) {
266 Mean mean = new Mean();
267 double m = mean.evaluate(values, begin, length);
268 var = evaluate(values, m, begin, length);
269 }
270 }
271 return var;
272 }
273
274 /**
275 * <p>Returns the weighted variance of the entries in the specified portion of
276 * the input array, or <code>Double.NaN</code> if the designated subarray
277 * is empty.</p>
278 * <p>
279 * Uses the formula <pre>
280 * Σ(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(Σ(weights[i]) - 1)
281 * </pre>
282 * where weightedMean is the weighted mean</p>
283 * <p>
284 * This formula will not return the same result as the unweighted variance when all
285 * weights are equal, unless all weights are equal to 1. The formula assumes that
286 * weights are to be treated as "expansion values," as will be the case if for example
287 * the weights represent frequency counts. To normalize weights so that the denominator
288 * in the variance computation equals the length of the input vector minus one, use <pre>
289 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
290 * </pre>
291 * <p>
292 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
293 * <p>
294 * Throws <code>IllegalArgumentException</code> if any of the following are true:
295 * <ul><li>the values array is null</li>
296 * <li>the weights array is null</li>
297 * <li>the weights array does not have the same length as the values array</li>
298 * <li>the weights array contains one or more infinite values</li>
299 * <li>the weights array contains one or more NaN values</li>
300 * <li>the weights array contains negative values</li>
301 * <li>the start and length arguments do not determine a valid array</li>
302 * </ul></p>
303 * <p>
304 * Does not change the internal state of the statistic.</p>
305 * <p>
306 * Throws <code>MathIllegalArgumentException</code> if either array is null.</p>
307 *
308 * @param values the input array
309 * @param weights the weights array
310 * @param begin index of the first array element to include
311 * @param length the number of elements to include
312 * @return the weighted variance of the values or Double.NaN if length = 0
313 * @throws MathIllegalArgumentException if the parameters are not valid
314 * @since 2.1
315 */
316 public double evaluate(final double[] values, final double[] weights,
317 final int begin, final int length) throws MathIllegalArgumentException {
318
319 double var = Double.NaN;
320
321 if (test(values, weights,begin, length)) {
322 clear();
323 if (length == 1) {
324 var = 0.0;
325 } else if (length > 1) {
326 Mean mean = new Mean();
327 double m = mean.evaluate(values, weights, begin, length);
328 var = evaluate(values, weights, m, begin, length);
329 }
330 }
331 return var;
332 }
333
334 /**
335 * <p>
336 * Returns the weighted variance of the entries in the the input array.</p>
337 * <p>
338 * Uses the formula <pre>
339 * Σ(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(Σ(weights[i]) - 1)
340 * </pre>
341 * where weightedMean is the weighted mean</p>
342 * <p>
343 * This formula will not return the same result as the unweighted variance when all
344 * weights are equal, unless all weights are equal to 1. The formula assumes that
345 * weights are to be treated as "expansion values," as will be the case if for example
346 * the weights represent frequency counts. To normalize weights so that the denominator
347 * in the variance computation equals the length of the input vector minus one, use <pre>
348 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length)); </code>
349 * </pre>
350 * <p>
351 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
352 * <p>
353 * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
354 * <ul><li>the values array is null</li>
355 * <li>the weights array is null</li>
356 * <li>the weights array does not have the same length as the values array</li>
357 * <li>the weights array contains one or more infinite values</li>
358 * <li>the weights array contains one or more NaN values</li>
359 * <li>the weights array contains negative values</li>
360 * </ul></p>
361 * <p>
362 * Does not change the internal state of the statistic.</p>
363 * <p>
364 * Throws <code>MathIllegalArgumentException</code> if either array is null.</p>
365 *
366 * @param values the input array
367 * @param weights the weights array
368 * @return the weighted variance of the values
369 * @throws MathIllegalArgumentException if the parameters are not valid
370 * @since 2.1
371 */
372 public double evaluate(final double[] values, final double[] weights)
373 throws MathIllegalArgumentException {
374 return evaluate(values, weights, 0, values.length);
375 }
376
377 /**
378 * Returns the variance of the entries in the specified portion of
379 * the input array, using the precomputed mean value. Returns
380 * <code>Double.NaN</code> if the designated subarray is empty.
381 * <p>
382 * See {@link Variance} for details on the computing algorithm.</p>
383 * <p>
384 * The formula used assumes that the supplied mean value is the arithmetic
385 * mean of the sample data, not a known population parameter. This method
386 * is supplied only to save computation when the mean has already been
387 * computed.</p>
388 * <p>
389 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
390 * <p>
391 * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
392 * <p>
393 * Does not change the internal state of the statistic.</p>
394 *
395 * @param values the input array
396 * @param mean the precomputed mean value
397 * @param begin index of the first array element to include
398 * @param length the number of elements to include
399 * @return the variance of the values or Double.NaN if length = 0
400 * @throws MathIllegalArgumentException if the array is null or the array index
401 * parameters are not valid
402 */
403 public double evaluate(final double[] values, final double mean,
404 final int begin, final int length) throws MathIllegalArgumentException {
405
406 double var = Double.NaN;
407
408 if (test(values, begin, length)) {
409 if (length == 1) {
410 var = 0.0;
411 } else if (length > 1) {
412 double accum = 0.0;
413 double dev = 0.0;
414 double accum2 = 0.0;
415 for (int i = begin; i < begin + length; i++) {
416 dev = values[i] - mean;
417 accum += dev * dev;
418 accum2 += dev;
419 }
420 double len = length;
421 if (isBiasCorrected) {
422 var = (accum - (accum2 * accum2 / len)) / (len - 1.0);
423 } else {
424 var = (accum - (accum2 * accum2 / len)) / len;
425 }
426 }
427 }
428 return var;
429 }
430
431 /**
432 * Returns the variance of the entries in the input array, using the
433 * precomputed mean value. Returns <code>Double.NaN</code> if the array
434 * is empty.
435 * <p>
436 * See {@link Variance} for details on the computing algorithm.</p>
437 * <p>
438 * If <code>isBiasCorrected</code> is <code>true</code> the formula used
439 * assumes that the supplied mean value is the arithmetic mean of the
440 * sample data, not a known population parameter. If the mean is a known
441 * population parameter, or if the "population" version of the variance is
442 * desired, set <code>isBiasCorrected</code> to <code>false</code> before
443 * invoking this method.</p>
444 * <p>
445 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
446 * <p>
447 * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
448 * <p>
449 * Does not change the internal state of the statistic.</p>
450 *
451 * @param values the input array
452 * @param mean the precomputed mean value
453 * @return the variance of the values or Double.NaN if the array is empty
454 * @throws MathIllegalArgumentException if the array is null
455 */
456 public double evaluate(final double[] values, final double mean) throws MathIllegalArgumentException {
457 return evaluate(values, mean, 0, values.length);
458 }
459
460 /**
461 * Returns the weighted variance of the entries in the specified portion of
462 * the input array, using the precomputed weighted mean value. Returns
463 * <code>Double.NaN</code> if the designated subarray is empty.
464 * <p>
465 * Uses the formula <pre>
466 * Σ(weights[i]*(values[i] - mean)<sup>2</sup>)/(Σ(weights[i]) - 1)
467 * </pre></p>
468 * <p>
469 * The formula used assumes that the supplied mean value is the weighted arithmetic
470 * mean of the sample data, not a known population parameter. This method
471 * is supplied only to save computation when the mean has already been
472 * computed.</p>
473 * <p>
474 * This formula will not return the same result as the unweighted variance when all
475 * weights are equal, unless all weights are equal to 1. The formula assumes that
476 * weights are to be treated as "expansion values," as will be the case if for example
477 * the weights represent frequency counts. To normalize weights so that the denominator
478 * in the variance computation equals the length of the input vector minus one, use <pre>
479 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
480 * </pre>
481 * <p>
482 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
483 * <p>
484 * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
485 * <ul><li>the values array is null</li>
486 * <li>the weights array is null</li>
487 * <li>the weights array does not have the same length as the values array</li>
488 * <li>the weights array contains one or more infinite values</li>
489 * <li>the weights array contains one or more NaN values</li>
490 * <li>the weights array contains negative values</li>
491 * <li>the start and length arguments do not determine a valid array</li>
492 * </ul></p>
493 * <p>
494 * Does not change the internal state of the statistic.</p>
495 *
496 * @param values the input array
497 * @param weights the weights array
498 * @param mean the precomputed weighted mean value
499 * @param begin index of the first array element to include
500 * @param length the number of elements to include
501 * @return the variance of the values or Double.NaN if length = 0
502 * @throws MathIllegalArgumentException if the parameters are not valid
503 * @since 2.1
504 */
505 public double evaluate(final double[] values, final double[] weights,
506 final double mean, final int begin, final int length)
507 throws MathIllegalArgumentException {
508
509 double var = Double.NaN;
510
511 if (test(values, weights, begin, length)) {
512 if (length == 1) {
513 var = 0.0;
514 } else if (length > 1) {
515 double accum = 0.0;
516 double dev = 0.0;
517 double accum2 = 0.0;
518 for (int i = begin; i < begin + length; i++) {
519 dev = values[i] - mean;
520 accum += weights[i] * (dev * dev);
521 accum2 += weights[i] * dev;
522 }
523
524 double sumWts = 0;
525 for (int i = begin; i < begin + length; i++) {
526 sumWts += weights[i];
527 }
528
529 if (isBiasCorrected) {
530 var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0);
531 } else {
532 var = (accum - (accum2 * accum2 / sumWts)) / sumWts;
533 }
534 }
535 }
536 return var;
537 }
538
539 /**
540 * <p>Returns the weighted variance of the values in the input array, using
541 * the precomputed weighted mean value.</p>
542 * <p>
543 * Uses the formula <pre>
544 * Σ(weights[i]*(values[i] - mean)<sup>2</sup>)/(Σ(weights[i]) - 1)
545 * </pre></p>
546 * <p>
547 * The formula used assumes that the supplied mean value is the weighted arithmetic
548 * mean of the sample data, not a known population parameter. This method
549 * is supplied only to save computation when the mean has already been
550 * computed.</p>
551 * <p>
552 * This formula will not return the same result as the unweighted variance when all
553 * weights are equal, unless all weights are equal to 1. The formula assumes that
554 * weights are to be treated as "expansion values," as will be the case if for example
555 * the weights represent frequency counts. To normalize weights so that the denominator
556 * in the variance computation equals the length of the input vector minus one, use <pre>
557 * <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean); </code>
558 * </pre>
559 * <p>
560 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
561 * <p>
562 * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
563 * <ul><li>the values array is null</li>
564 * <li>the weights array is null</li>
565 * <li>the weights array does not have the same length as the values array</li>
566 * <li>the weights array contains one or more infinite values</li>
567 * <li>the weights array contains one or more NaN values</li>
568 * <li>the weights array contains negative values</li>
569 * </ul></p>
570 * <p>
571 * Does not change the internal state of the statistic.</p>
572 *
573 * @param values the input array
574 * @param weights the weights array
575 * @param mean the precomputed weighted mean value
576 * @return the variance of the values or Double.NaN if length = 0
577 * @throws MathIllegalArgumentException if the parameters are not valid
578 * @since 2.1
579 */
580 public double evaluate(final double[] values, final double[] weights, final double mean)
581 throws MathIllegalArgumentException {
582 return evaluate(values, weights, mean, 0, values.length);
583 }
584
585 /**
586 * @return Returns the isBiasCorrected.
587 */
588 public boolean isBiasCorrected() {
589 return isBiasCorrected;
590 }
591
592 /**
593 * @param biasCorrected The isBiasCorrected to set.
594 */
595 public void setBiasCorrected(boolean biasCorrected) {
596 this.isBiasCorrected = biasCorrected;
597 }
598
599 /**
600 * {@inheritDoc}
601 */
602 @Override
603 public Variance copy() {
604 Variance result = new Variance();
605 // No try-catch or advertised exception because parameters are guaranteed non-null
606 copy(this, result);
607 return result;
608 }
609
610 /**
611 * Copies source to dest.
612 * <p>Neither source nor dest can be null.</p>
613 *
614 * @param source Variance to copy
615 * @param dest Variance to copy to
616 * @throws NullArgumentException if either source or dest is null
617 */
618 public static void copy(Variance source, Variance dest)
619 throws NullArgumentException {
620 MathUtils.checkNotNull(source);
621 MathUtils.checkNotNull(dest);
622 dest.setData(source.getDataRef());
623 dest.moment = source.moment.copy();
624 dest.isBiasCorrected = source.isBiasCorrected;
625 dest.incMoment = source.incMoment;
626 }
627 }