001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.math3.stat.inference;
018
019 import org.apache.commons.math3.distribution.FDistribution;
020 import org.apache.commons.math3.exception.ConvergenceException;
021 import org.apache.commons.math3.exception.DimensionMismatchException;
022 import org.apache.commons.math3.exception.MaxCountExceededException;
023 import org.apache.commons.math3.exception.NullArgumentException;
024 import org.apache.commons.math3.exception.OutOfRangeException;
025 import org.apache.commons.math3.exception.util.LocalizedFormats;
026 import org.apache.commons.math3.stat.descriptive.summary.Sum;
027 import org.apache.commons.math3.stat.descriptive.summary.SumOfSquares;
028
029 import java.util.Collection;
030
031 /**
032 * Implements one-way ANOVA (analysis of variance) statistics.
033 *
034 * <p> Tests for differences between two or more categories of univariate data
035 * (for example, the body mass index of accountants, lawyers, doctors and
036 * computer programmers). When two categories are given, this is equivalent to
037 * the {@link org.apache.commons.math3.stat.inference.TTest}.
038 * </p><p>
039 * Uses the {@link org.apache.commons.math3.distribution.FDistribution
040 * commons-math F Distribution implementation} to estimate exact p-values.</p>
041 * <p>This implementation is based on a description at
042 * http://faculty.vassar.edu/lowry/ch13pt1.html</p>
043 * <pre>
044 * Abbreviations: bg = between groups,
045 * wg = within groups,
046 * ss = sum squared deviations
047 * </pre>
048 *
049 * @since 1.2
050 * @version $Id: OneWayAnova.java 1416643 2012-12-03 19:37:14Z tn $
051 */
052 public class OneWayAnova {
053
054 /**
055 * Default constructor.
056 */
057 public OneWayAnova() {
058 }
059
060 /**
061 * Computes the ANOVA F-value for a collection of <code>double[]</code>
062 * arrays.
063 *
064 * <p><strong>Preconditions</strong>: <ul>
065 * <li>The categoryData <code>Collection</code> must contain
066 * <code>double[]</code> arrays.</li>
067 * <li> There must be at least two <code>double[]</code> arrays in the
068 * <code>categoryData</code> collection and each of these arrays must
069 * contain at least two values.</li></ul></p><p>
070 * This implementation computes the F statistic using the definitional
071 * formula<pre>
072 * F = msbg/mswg</pre>
073 * where<pre>
074 * msbg = between group mean square
075 * mswg = within group mean square</pre>
076 * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">
077 * here</a></p>
078 *
079 * @param categoryData <code>Collection</code> of <code>double[]</code>
080 * arrays each containing data for one category
081 * @return Fvalue
082 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
083 * @throws DimensionMismatchException if the length of the <code>categoryData</code>
084 * array is less than 2 or a contained <code>double[]</code> array does not have
085 * at least two values
086 */
087 public double anovaFValue(final Collection<double[]> categoryData)
088 throws NullArgumentException, DimensionMismatchException {
089
090 AnovaStats a = anovaStats(categoryData);
091 return a.F;
092
093 }
094
095 /**
096 * Computes the ANOVA P-value for a collection of <code>double[]</code>
097 * arrays.
098 *
099 * <p><strong>Preconditions</strong>: <ul>
100 * <li>The categoryData <code>Collection</code> must contain
101 * <code>double[]</code> arrays.</li>
102 * <li> There must be at least two <code>double[]</code> arrays in the
103 * <code>categoryData</code> collection and each of these arrays must
104 * contain at least two values.</li></ul></p><p>
105 * This implementation uses the
106 * {@link org.apache.commons.math3.distribution.FDistribution
107 * commons-math F Distribution implementation} to estimate the exact
108 * p-value, using the formula<pre>
109 * p = 1 - cumulativeProbability(F)</pre>
110 * where <code>F</code> is the F value and <code>cumulativeProbability</code>
111 * is the commons-math implementation of the F distribution.</p>
112 *
113 * @param categoryData <code>Collection</code> of <code>double[]</code>
114 * arrays each containing data for one category
115 * @return Pvalue
116 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
117 * @throws DimensionMismatchException if the length of the <code>categoryData</code>
118 * array is less than 2 or a contained <code>double[]</code> array does not have
119 * at least two values
120 * @throws ConvergenceException if the p-value can not be computed due to a convergence error
121 * @throws MaxCountExceededException if the maximum number of iterations is exceeded
122 */
123 public double anovaPValue(final Collection<double[]> categoryData)
124 throws NullArgumentException, DimensionMismatchException,
125 ConvergenceException, MaxCountExceededException {
126
127 AnovaStats a = anovaStats(categoryData);
128 // No try-catch or advertised exception because args are valid
129 FDistribution fdist = new FDistribution(a.dfbg, a.dfwg);
130 return 1.0 - fdist.cumulativeProbability(a.F);
131
132 }
133
134 /**
135 * Performs an ANOVA test, evaluating the null hypothesis that there
136 * is no difference among the means of the data categories.
137 *
138 * <p><strong>Preconditions</strong>: <ul>
139 * <li>The categoryData <code>Collection</code> must contain
140 * <code>double[]</code> arrays.</li>
141 * <li> There must be at least two <code>double[]</code> arrays in the
142 * <code>categoryData</code> collection and each of these arrays must
143 * contain at least two values.</li>
144 * <li>alpha must be strictly greater than 0 and less than or equal to 0.5.
145 * </li></ul></p><p>
146 * This implementation uses the
147 * {@link org.apache.commons.math3.distribution.FDistribution
148 * commons-math F Distribution implementation} to estimate the exact
149 * p-value, using the formula<pre>
150 * p = 1 - cumulativeProbability(F)</pre>
151 * where <code>F</code> is the F value and <code>cumulativeProbability</code>
152 * is the commons-math implementation of the F distribution.</p>
153 * <p>True is returned iff the estimated p-value is less than alpha.</p>
154 *
155 * @param categoryData <code>Collection</code> of <code>double[]</code>
156 * arrays each containing data for one category
157 * @param alpha significance level of the test
158 * @return true if the null hypothesis can be rejected with
159 * confidence 1 - alpha
160 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
161 * @throws DimensionMismatchException if the length of the <code>categoryData</code>
162 * array is less than 2 or a contained <code>double[]</code> array does not have
163 * at least two values
164 * @throws OutOfRangeException if <code>alpha</code> is not in the range (0, 0.5]
165 * @throws ConvergenceException if the p-value can not be computed due to a convergence error
166 * @throws MaxCountExceededException if the maximum number of iterations is exceeded
167 */
168 public boolean anovaTest(final Collection<double[]> categoryData,
169 final double alpha)
170 throws NullArgumentException, DimensionMismatchException,
171 OutOfRangeException, ConvergenceException, MaxCountExceededException {
172
173 if ((alpha <= 0) || (alpha > 0.5)) {
174 throw new OutOfRangeException(
175 LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
176 alpha, 0, 0.5);
177 }
178 return anovaPValue(categoryData) < alpha;
179
180 }
181
182 /**
183 * This method actually does the calculations (except P-value).
184 *
185 * @param categoryData <code>Collection</code> of <code>double[]</code>
186 * arrays each containing data for one category
187 * @return computed AnovaStats
188 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
189 * @throws DimensionMismatchException if the length of the <code>categoryData</code>
190 * array is less than 2 or a contained <code>double[]</code> array does not contain
191 * at least two values
192 */
193 private AnovaStats anovaStats(final Collection<double[]> categoryData)
194 throws NullArgumentException, DimensionMismatchException {
195
196 if (categoryData == null) {
197 throw new NullArgumentException();
198 }
199
200 // check if we have enough categories
201 if (categoryData.size() < 2) {
202 throw new DimensionMismatchException(
203 LocalizedFormats.TWO_OR_MORE_CATEGORIES_REQUIRED,
204 categoryData.size(), 2);
205 }
206
207 // check if each category has enough data and all is double[]
208 for (double[] array : categoryData) {
209 if (array.length <= 1) {
210 throw new DimensionMismatchException(
211 LocalizedFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED,
212 array.length, 2);
213 }
214 }
215
216 int dfwg = 0;
217 double sswg = 0;
218 Sum totsum = new Sum();
219 SumOfSquares totsumsq = new SumOfSquares();
220 int totnum = 0;
221
222 for (double[] data : categoryData) {
223
224 Sum sum = new Sum();
225 SumOfSquares sumsq = new SumOfSquares();
226 int num = 0;
227
228 for (int i = 0; i < data.length; i++) {
229 double val = data[i];
230
231 // within category
232 num++;
233 sum.increment(val);
234 sumsq.increment(val);
235
236 // for all categories
237 totnum++;
238 totsum.increment(val);
239 totsumsq.increment(val);
240 }
241 dfwg += num - 1;
242 double ss = sumsq.getResult() - sum.getResult() * sum.getResult() / num;
243 sswg += ss;
244 }
245 double sst = totsumsq.getResult() - totsum.getResult() *
246 totsum.getResult()/totnum;
247 double ssbg = sst - sswg;
248 int dfbg = categoryData.size() - 1;
249 double msbg = ssbg/dfbg;
250 double mswg = sswg/dfwg;
251 double F = msbg/mswg;
252
253 return new AnovaStats(dfbg, dfwg, F);
254 }
255
256 /**
257 Convenience class to pass dfbg,dfwg,F values around within OneWayAnova.
258 No get/set methods provided.
259 */
260 private static class AnovaStats {
261
262 /** Degrees of freedom in numerator (between groups). */
263 private final int dfbg;
264
265 /** Degrees of freedom in denominator (within groups). */
266 private final int dfwg;
267
268 /** Statistic. */
269 private final double F;
270
271 /**
272 * Constructor
273 * @param dfbg degrees of freedom in numerator (between groups)
274 * @param dfwg degrees of freedom in denominator (within groups)
275 * @param F statistic
276 */
277 private AnovaStats(int dfbg, int dfwg, double F) {
278 this.dfbg = dfbg;
279 this.dfwg = dfwg;
280 this.F = F;
281 }
282 }
283
284 }