001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 package org.apache.commons.math3.distribution;
019
020 import java.io.Serializable;
021 import java.math.BigDecimal;
022
023 import org.apache.commons.math3.exception.MathArithmeticException;
024 import org.apache.commons.math3.exception.NotStrictlyPositiveException;
025 import org.apache.commons.math3.exception.NumberIsTooLargeException;
026 import org.apache.commons.math3.exception.util.LocalizedFormats;
027 import org.apache.commons.math3.fraction.BigFraction;
028 import org.apache.commons.math3.fraction.BigFractionField;
029 import org.apache.commons.math3.fraction.FractionConversionException;
030 import org.apache.commons.math3.linear.Array2DRowFieldMatrix;
031 import org.apache.commons.math3.linear.Array2DRowRealMatrix;
032 import org.apache.commons.math3.linear.FieldMatrix;
033 import org.apache.commons.math3.linear.RealMatrix;
034
035 /**
036 * Implementation of the Kolmogorov-Smirnov distribution.
037 *
038 * <p>
039 * Treats the distribution of the two-sided {@code P(D_n < d)} where
040 * {@code D_n = sup_x |G(x) - G_n (x)|} for the theoretical cdf {@code G} and
041 * the empirical cdf {@code G_n}.
042 * </p>
043 * <p>
044 * This implementation is based on [1] with certain quick decisions for extreme
045 * values given in [2].
046 * </p>
047 * <p>
048 * In short, when wanting to evaluate {@code P(D_n < d)}, the method in [1] is
049 * to write {@code d = (k - h) / n} for positive integer {@code k} and
050 * {@code 0 <= h < 1}. Then {@code P(D_n < d) = (n! / n^n) * t_kk}, where
051 * {@code t_kk} is the {@code (k, k)}'th entry in the special matrix
052 * {@code H^n}, i.e. {@code H} to the {@code n}'th power.
053 * </p>
054 * <p>
055 * References:
056 * <ul>
057 * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/">
058 * Evaluating Kolmogorov's Distribution</a> by George Marsaglia, Wai
059 * Wan Tsang, and Jingbo Wang</li>
060 * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/">
061 * Computing the Two-Sided Kolmogorov-Smirnov Distribution</a> by Richard Simard
062 * and Pierre L'Ecuyer</li>
063 * </ul>
064 * Note that [1] contains an error in computing h, refer to
065 * <a href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
066 * </p>
067 *
068 * @see <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
069 * Kolmogorov-Smirnov test (Wikipedia)</a>
070 * @version $Id: KolmogorovSmirnovDistribution.java 1416643 2012-12-03 19:37:14Z tn $
071 */
072 public class KolmogorovSmirnovDistribution implements Serializable {
073
074 /** Serializable version identifier. */
075 private static final long serialVersionUID = -4670676796862967187L;
076
077 /** Number of observations. */
078 private int n;
079
080 /**
081 * @param n Number of observations
082 * @throws NotStrictlyPositiveException if {@code n <= 0}
083 */
084 public KolmogorovSmirnovDistribution(int n)
085 throws NotStrictlyPositiveException {
086 if (n <= 0) {
087 throw new NotStrictlyPositiveException(LocalizedFormats.NOT_POSITIVE_NUMBER_OF_SAMPLES, n);
088 }
089
090 this.n = n;
091 }
092
093 /**
094 * Calculates {@code P(D_n < d)} using method described in [1] with quick
095 * decisions for extreme values given in [2] (see above). The result is not
096 * exact as with
097 * {@link KolmogorovSmirnovDistribution#cdfExact(double)} because
098 * calculations are based on {@code double} rather than
099 * {@link org.apache.commons.math3.fraction.BigFraction}.
100 *
101 * @param d statistic
102 * @return the two-sided probability of {@code P(D_n < d)}
103 * @throws MathArithmeticException if algorithm fails to convert {@code h}
104 * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing
105 * {@code d} as {@code (k - h) / m} for integer {@code k, m} and
106 * {@code 0 <= h < 1}.
107 */
108 public double cdf(double d) throws MathArithmeticException {
109 return this.cdf(d, false);
110 }
111
112 /**
113 * Calculates {@code P(D_n < d)} using method described in [1] with quick
114 * decisions for extreme values given in [2] (see above). The result is
115 * exact in the sense that BigFraction/BigReal is used everywhere at the
116 * expense of very slow execution time. Almost never choose this in real
117 * applications unless you are very sure; this is almost solely for
118 * verification purposes. Normally, you would choose
119 * {@link KolmogorovSmirnovDistribution#cdf(double)}
120 *
121 * @param d statistic
122 * @return the two-sided probability of {@code P(D_n < d)}
123 * @throws MathArithmeticException if algorithm fails to convert {@code h}
124 * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing
125 * {@code d} as {@code (k - h) / m} for integer {@code k, m} and
126 * {@code 0 <= h < 1}.
127 */
128 public double cdfExact(double d) throws MathArithmeticException {
129 return this.cdf(d, true);
130 }
131
132 /**
133 * Calculates {@code P(D_n < d)} using method described in [1] with quick
134 * decisions for extreme values given in [2] (see above).
135 *
136 * @param d statistic
137 * @param exact whether the probability should be calculated exact using
138 * {@link org.apache.commons.math3.fraction.BigFraction} everywhere at the
139 * expense of very slow execution time, or if {@code double} should be used
140 * convenient places to gain speed. Almost never choose {@code true} in real
141 * applications unless you are very sure; {@code true} is almost solely for
142 * verification purposes.
143 * @return the two-sided probability of {@code P(D_n < d)}
144 * @throws MathArithmeticException if algorithm fails to convert {@code h}
145 * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing
146 * {@code d} as {@code (k - h) / m} for integer {@code k, m} and
147 * {@code 0 <= h < 1}.
148 */
149 public double cdf(double d, boolean exact) throws MathArithmeticException {
150
151 final double ninv = 1 / ((double) n);
152 final double ninvhalf = 0.5 * ninv;
153
154 if (d <= ninvhalf) {
155
156 return 0;
157
158 } else if (ninvhalf < d && d <= ninv) {
159
160 double res = 1;
161 double f = 2 * d - ninv;
162
163 // n! f^n = n*f * (n-1)*f * ... * 1*x
164 for (int i = 1; i <= n; ++i) {
165 res *= i * f;
166 }
167
168 return res;
169
170 } else if (1 - ninv <= d && d < 1) {
171
172 return 1 - 2 * Math.pow(1 - d, n);
173
174 } else if (1 <= d) {
175
176 return 1;
177 }
178
179 return exact ? exactK(d) : roundedK(d);
180 }
181
182 /**
183 * Calculates the exact value of {@code P(D_n < d)} using method described
184 * in [1] and {@link org.apache.commons.math3.fraction.BigFraction} (see
185 * above).
186 *
187 * @param d statistic
188 * @return the two-sided probability of {@code P(D_n < d)}
189 * @throws MathArithmeticException if algorithm fails to convert {@code h}
190 * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing
191 * {@code d} as {@code (k - h) / m} for integer {@code k, m} and
192 * {@code 0 <= h < 1}.
193 */
194 private double exactK(double d) throws MathArithmeticException {
195
196 final int k = (int) Math.ceil(n * d);
197
198 final FieldMatrix<BigFraction> H = this.createH(d);
199 final FieldMatrix<BigFraction> Hpower = H.power(n);
200
201 BigFraction pFrac = Hpower.getEntry(k - 1, k - 1);
202
203 for (int i = 1; i <= n; ++i) {
204 pFrac = pFrac.multiply(i).divide(n);
205 }
206
207 /*
208 * BigFraction.doubleValue converts numerator to double and the
209 * denominator to double and divides afterwards. That gives NaN quite
210 * easy. This does not (scale is the number of digits):
211 */
212 return pFrac.bigDecimalValue(20, BigDecimal.ROUND_HALF_UP).doubleValue();
213 }
214
215 /**
216 * Calculates {@code P(D_n < d)} using method described in [1] and doubles
217 * (see above).
218 *
219 * @param d statistic
220 * @return the two-sided probability of {@code P(D_n < d)}
221 * @throws MathArithmeticException if algorithm fails to convert {@code h}
222 * to a {@link org.apache.commons.math3.fraction.BigFraction} in expressing
223 * {@code d} as {@code (k - h) / m} for integer {@code k, m} and
224 * {@code 0 <= h < 1}.
225 */
226 private double roundedK(double d) throws MathArithmeticException {
227
228 final int k = (int) Math.ceil(n * d);
229 final FieldMatrix<BigFraction> HBigFraction = this.createH(d);
230 final int m = HBigFraction.getRowDimension();
231
232 /*
233 * Here the rounding part comes into play: use
234 * RealMatrix instead of FieldMatrix<BigFraction>
235 */
236 final RealMatrix H = new Array2DRowRealMatrix(m, m);
237
238 for (int i = 0; i < m; ++i) {
239 for (int j = 0; j < m; ++j) {
240 H.setEntry(i, j, HBigFraction.getEntry(i, j).doubleValue());
241 }
242 }
243
244 final RealMatrix Hpower = H.power(n);
245
246 double pFrac = Hpower.getEntry(k - 1, k - 1);
247
248 for (int i = 1; i <= n; ++i) {
249 pFrac *= (double) i / (double) n;
250 }
251
252 return pFrac;
253 }
254
255 /***
256 * Creates {@code H} of size {@code m x m} as described in [1] (see above).
257 *
258 * @param d statistic
259 * @return H matrix
260 * @throws NumberIsTooLargeException if fractional part is greater than 1
261 * @throws FractionConversionException if algorithm fails to convert
262 * {@code h} to a {@link org.apache.commons.math3.fraction.BigFraction} in
263 * expressing {@code d} as {@code (k - h) / m} for integer {@code k, m} and
264 * {@code 0 <= h < 1}.
265 */
266 private FieldMatrix<BigFraction> createH(double d)
267 throws NumberIsTooLargeException, FractionConversionException {
268
269 int k = (int) Math.ceil(n * d);
270
271 int m = 2 * k - 1;
272 double hDouble = k - n * d;
273
274 if (hDouble >= 1) {
275 throw new NumberIsTooLargeException(hDouble, 1.0, false);
276 }
277
278 BigFraction h = null;
279
280 try {
281 h = new BigFraction(hDouble, 1.0e-20, 10000);
282 } catch (FractionConversionException e1) {
283 try {
284 h = new BigFraction(hDouble, 1.0e-10, 10000);
285 } catch (FractionConversionException e2) {
286 h = new BigFraction(hDouble, 1.0e-5, 10000);
287 }
288 }
289
290 final BigFraction[][] Hdata = new BigFraction[m][m];
291
292 /*
293 * Start by filling everything with either 0 or 1.
294 */
295 for (int i = 0; i < m; ++i) {
296 for (int j = 0; j < m; ++j) {
297 if (i - j + 1 < 0) {
298 Hdata[i][j] = BigFraction.ZERO;
299 } else {
300 Hdata[i][j] = BigFraction.ONE;
301 }
302 }
303 }
304
305 /*
306 * Setting up power-array to avoid calculating the same value twice:
307 * hPowers[0] = h^1 ... hPowers[m-1] = h^m
308 */
309 final BigFraction[] hPowers = new BigFraction[m];
310 hPowers[0] = h;
311 for (int i = 1; i < m; ++i) {
312 hPowers[i] = h.multiply(hPowers[i - 1]);
313 }
314
315 /*
316 * First column and last row has special values (each other reversed).
317 */
318 for (int i = 0; i < m; ++i) {
319 Hdata[i][0] = Hdata[i][0].subtract(hPowers[i]);
320 Hdata[m - 1][i] = Hdata[m - 1][i].subtract(hPowers[m - i - 1]);
321 }
322
323 /*
324 * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix
325 * should be (1 - 2*h^m + (2h - 1)^m )/m!" Since 0 <= h < 1, then if h >
326 * 1/2 is sufficient to check:
327 */
328 if (h.compareTo(BigFraction.ONE_HALF) == 1) {
329 Hdata[m - 1][0] = Hdata[m - 1][0].add(h.multiply(2).subtract(1).pow(m));
330 }
331
332 /*
333 * Aside from the first column and last row, the (i, j)-th element is
334 * 1/(i - j + 1)! if i - j + 1 >= 0, else 0. 1's and 0's are already
335 * put, so only division with (i - j + 1)! is needed in the elements
336 * that have 1's. There is no need to calculate (i - j + 1)! and then
337 * divide - small steps avoid overflows.
338 *
339 * Note that i - j + 1 > 0 <=> i + 1 > j instead of j'ing all the way to
340 * m. Also note that it is started at g = 2 because dividing by 1 isn't
341 * really necessary.
342 */
343 for (int i = 0; i < m; ++i) {
344 for (int j = 0; j < i + 1; ++j) {
345 if (i - j + 1 > 0) {
346 for (int g = 2; g <= i - j + 1; ++g) {
347 Hdata[i][j] = Hdata[i][j].divide(g);
348 }
349 }
350 }
351 }
352
353 return new Array2DRowFieldMatrix<BigFraction>(BigFractionField.getInstance(), Hdata);
354 }
355 }