001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.math3.stat.descriptive.rank;
018
019 import java.io.Serializable;
020 import java.util.Arrays;
021
022 import org.apache.commons.math3.exception.MathIllegalArgumentException;
023 import org.apache.commons.math3.exception.NullArgumentException;
024 import org.apache.commons.math3.exception.OutOfRangeException;
025 import org.apache.commons.math3.exception.util.LocalizedFormats;
026 import org.apache.commons.math3.stat.descriptive.AbstractUnivariateStatistic;
027 import org.apache.commons.math3.util.FastMath;
028 import org.apache.commons.math3.util.MathUtils;
029
030 /**
031 * Provides percentile computation.
032 * <p>
033 * There are several commonly used methods for estimating percentiles (a.k.a.
034 * quantiles) based on sample data. For large samples, the different methods
035 * agree closely, but when sample sizes are small, different methods will give
036 * significantly different results. The algorithm implemented here works as follows:
037 * <ol>
038 * <li>Let <code>n</code> be the length of the (sorted) array and
039 * <code>0 < p <= 100</code> be the desired percentile.</li>
040 * <li>If <code> n = 1 </code> return the unique array element (regardless of
041 * the value of <code>p</code>); otherwise </li>
042 * <li>Compute the estimated percentile position
043 * <code> pos = p * (n + 1) / 100</code> and the difference, <code>d</code>
044 * between <code>pos</code> and <code>floor(pos)</code> (i.e. the fractional
045 * part of <code>pos</code>).</li>
046 * <li> If <code>pos < 1</code> return the smallest element in the array.</li>
047 * <li> Else if <code>pos >= n</code> return the largest element in the array.</li>
048 * <li> Else let <code>lower</code> be the element in position
049 * <code>floor(pos)</code> in the array and let <code>upper</code> be the
050 * next element in the array. Return <code>lower + d * (upper - lower)</code>
051 * </li>
052 * </ol></p>
053 * <p>
054 * To compute percentiles, the data must be at least partially ordered. Input
055 * arrays are copied and recursively partitioned using an ordering definition.
056 * The ordering used by <code>Arrays.sort(double[])</code> is the one determined
057 * by {@link java.lang.Double#compareTo(Double)}. This ordering makes
058 * <code>Double.NaN</code> larger than any other value (including
059 * <code>Double.POSITIVE_INFINITY</code>). Therefore, for example, the median
060 * (50th percentile) of
061 * <code>{0, 1, 2, 3, 4, Double.NaN}</code> evaluates to <code>2.5.</code></p>
062 * <p>
063 * Since percentile estimation usually involves interpolation between array
064 * elements, arrays containing <code>NaN</code> or infinite values will often
065 * result in <code>NaN</code> or infinite values returned.</p>
066 * <p>
067 * Since 2.2, Percentile uses only selection instead of complete sorting
068 * and caches selection algorithm state between calls to the various
069 * {@code evaluate} methods. This greatly improves efficiency, both for a single
070 * percentile and multiple percentile computations. To maximize performance when
071 * multiple percentiles are computed based on the same data, users should set the
072 * data array once using either one of the {@link #evaluate(double[], double)} or
073 * {@link #setData(double[])} methods and thereafter {@link #evaluate(double)}
074 * with just the percentile provided.
075 * </p>
076 * <p>
077 * <strong>Note that this implementation is not synchronized.</strong> If
078 * multiple threads access an instance of this class concurrently, and at least
079 * one of the threads invokes the <code>increment()</code> or
080 * <code>clear()</code> method, it must be synchronized externally.</p>
081 *
082 * @version $Id: Percentile.java 1416643 2012-12-03 19:37:14Z tn $
083 */
084 public class Percentile extends AbstractUnivariateStatistic implements Serializable {
085
086 /** Serializable version identifier */
087 private static final long serialVersionUID = -8091216485095130416L;
088
089 /** Minimum size under which we use a simple insertion sort rather than Hoare's select. */
090 private static final int MIN_SELECT_SIZE = 15;
091
092 /** Maximum number of partitioning pivots cached (each level double the number of pivots). */
093 private static final int MAX_CACHED_LEVELS = 10;
094
095 /** Determines what percentile is computed when evaluate() is activated
096 * with no quantile argument */
097 private double quantile = 0.0;
098
099 /** Cached pivots. */
100 private int[] cachedPivots;
101
102 /**
103 * Constructs a Percentile with a default quantile
104 * value of 50.0.
105 */
106 public Percentile() {
107 // No try-catch or advertised exception here - arg is valid
108 this(50.0);
109 }
110
111 /**
112 * Constructs a Percentile with the specific quantile value.
113 * @param p the quantile
114 * @throws MathIllegalArgumentException if p is not greater than 0 and less
115 * than or equal to 100
116 */
117 public Percentile(final double p) throws MathIllegalArgumentException {
118 setQuantile(p);
119 cachedPivots = null;
120 }
121
122 /**
123 * Copy constructor, creates a new {@code Percentile} identical
124 * to the {@code original}
125 *
126 * @param original the {@code Percentile} instance to copy
127 * @throws NullArgumentException if original is null
128 */
129 public Percentile(Percentile original) throws NullArgumentException {
130 copy(original, this);
131 }
132
133 /** {@inheritDoc} */
134 @Override
135 public void setData(final double[] values) {
136 if (values == null) {
137 cachedPivots = null;
138 } else {
139 cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
140 Arrays.fill(cachedPivots, -1);
141 }
142 super.setData(values);
143 }
144
145 /** {@inheritDoc} */
146 @Override
147 public void setData(final double[] values, final int begin, final int length)
148 throws MathIllegalArgumentException {
149 if (values == null) {
150 cachedPivots = null;
151 } else {
152 cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
153 Arrays.fill(cachedPivots, -1);
154 }
155 super.setData(values, begin, length);
156 }
157
158 /**
159 * Returns the result of evaluating the statistic over the stored data.
160 * <p>
161 * The stored array is the one which was set by previous calls to
162 * {@link #setData(double[])}
163 * </p>
164 * @param p the percentile value to compute
165 * @return the value of the statistic applied to the stored data
166 * @throws MathIllegalArgumentException if p is not a valid quantile value
167 * (p must be greater than 0 and less than or equal to 100)
168 */
169 public double evaluate(final double p) throws MathIllegalArgumentException {
170 return evaluate(getDataRef(), p);
171 }
172
173 /**
174 * Returns an estimate of the <code>p</code>th percentile of the values
175 * in the <code>values</code> array.
176 * <p>
177 * Calls to this method do not modify the internal <code>quantile</code>
178 * state of this statistic.</p>
179 * <p>
180 * <ul>
181 * <li>Returns <code>Double.NaN</code> if <code>values</code> has length
182 * <code>0</code></li>
183 * <li>Returns (for any value of <code>p</code>) <code>values[0]</code>
184 * if <code>values</code> has length <code>1</code></li>
185 * <li>Throws <code>MathIllegalArgumentException</code> if <code>values</code>
186 * is null or p is not a valid quantile value (p must be greater than 0
187 * and less than or equal to 100) </li>
188 * </ul></p>
189 * <p>
190 * See {@link Percentile} for a description of the percentile estimation
191 * algorithm used.</p>
192 *
193 * @param values input array of values
194 * @param p the percentile value to compute
195 * @return the percentile value or Double.NaN if the array is empty
196 * @throws MathIllegalArgumentException if <code>values</code> is null
197 * or p is invalid
198 */
199 public double evaluate(final double[] values, final double p)
200 throws MathIllegalArgumentException {
201 test(values, 0, 0);
202 return evaluate(values, 0, values.length, p);
203 }
204
205 /**
206 * Returns an estimate of the <code>quantile</code>th percentile of the
207 * designated values in the <code>values</code> array. The quantile
208 * estimated is determined by the <code>quantile</code> property.
209 * <p>
210 * <ul>
211 * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li>
212 * <li>Returns (for any value of <code>quantile</code>)
213 * <code>values[begin]</code> if <code>length = 1 </code></li>
214 * <li>Throws <code>MathIllegalArgumentException</code> if <code>values</code>
215 * is null, or <code>start</code> or <code>length</code> is invalid</li>
216 * </ul></p>
217 * <p>
218 * See {@link Percentile} for a description of the percentile estimation
219 * algorithm used.</p>
220 *
221 * @param values the input array
222 * @param start index of the first array element to include
223 * @param length the number of elements to include
224 * @return the percentile value
225 * @throws MathIllegalArgumentException if the parameters are not valid
226 *
227 */
228 @Override
229 public double evaluate(final double[] values, final int start, final int length)
230 throws MathIllegalArgumentException {
231 return evaluate(values, start, length, quantile);
232 }
233
234 /**
235 * Returns an estimate of the <code>p</code>th percentile of the values
236 * in the <code>values</code> array, starting with the element in (0-based)
237 * position <code>begin</code> in the array and including <code>length</code>
238 * values.
239 * <p>
240 * Calls to this method do not modify the internal <code>quantile</code>
241 * state of this statistic.</p>
242 * <p>
243 * <ul>
244 * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li>
245 * <li>Returns (for any value of <code>p</code>) <code>values[begin]</code>
246 * if <code>length = 1 </code></li>
247 * <li>Throws <code>MathIllegalArgumentException</code> if <code>values</code>
248 * is null , <code>begin</code> or <code>length</code> is invalid, or
249 * <code>p</code> is not a valid quantile value (p must be greater than 0
250 * and less than or equal to 100)</li>
251 * </ul></p>
252 * <p>
253 * See {@link Percentile} for a description of the percentile estimation
254 * algorithm used.</p>
255 *
256 * @param values array of input values
257 * @param p the percentile to compute
258 * @param begin the first (0-based) element to include in the computation
259 * @param length the number of array elements to include
260 * @return the percentile value
261 * @throws MathIllegalArgumentException if the parameters are not valid or the
262 * input array is null
263 */
264 public double evaluate(final double[] values, final int begin,
265 final int length, final double p) throws MathIllegalArgumentException {
266
267 test(values, begin, length);
268
269 if ((p > 100) || (p <= 0)) {
270 throw new OutOfRangeException(
271 LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p, 0, 100);
272 }
273 if (length == 0) {
274 return Double.NaN;
275 }
276 if (length == 1) {
277 return values[begin]; // always return single value for n = 1
278 }
279 double n = length;
280 double pos = p * (n + 1) / 100;
281 double fpos = FastMath.floor(pos);
282 int intPos = (int) fpos;
283 double dif = pos - fpos;
284 double[] work;
285 int[] pivotsHeap;
286 if (values == getDataRef()) {
287 work = getDataRef();
288 pivotsHeap = cachedPivots;
289 } else {
290 work = new double[length];
291 System.arraycopy(values, begin, work, 0, length);
292 pivotsHeap = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
293 Arrays.fill(pivotsHeap, -1);
294 }
295
296 if (pos < 1) {
297 return select(work, pivotsHeap, 0);
298 }
299 if (pos >= n) {
300 return select(work, pivotsHeap, length - 1);
301 }
302 double lower = select(work, pivotsHeap, intPos - 1);
303 double upper = select(work, pivotsHeap, intPos);
304 return lower + dif * (upper - lower);
305 }
306
307 /**
308 * Select the k<sup>th</sup> smallest element from work array
309 * @param work work array (will be reorganized during the call)
310 * @param pivotsHeap set of pivot index corresponding to elements that
311 * are already at their sorted location, stored as an implicit heap
312 * (i.e. a sorted binary tree stored in a flat array, where the
313 * children of a node at index n are at indices 2n+1 for the left
314 * child and 2n+2 for the right child, with 0-based indices)
315 * @param k index of the desired element
316 * @return k<sup>th</sup> smallest element
317 */
318 private double select(final double[] work, final int[] pivotsHeap, final int k) {
319
320 int begin = 0;
321 int end = work.length;
322 int node = 0;
323
324 while (end - begin > MIN_SELECT_SIZE) {
325
326 final int pivot;
327 if ((node < pivotsHeap.length) && (pivotsHeap[node] >= 0)) {
328 // the pivot has already been found in a previous call
329 // and the array has already been partitioned around it
330 pivot = pivotsHeap[node];
331 } else {
332 // select a pivot and partition work array around it
333 pivot = partition(work, begin, end, medianOf3(work, begin, end));
334 if (node < pivotsHeap.length) {
335 pivotsHeap[node] = pivot;
336 }
337 }
338
339 if (k == pivot) {
340 // the pivot was exactly the element we wanted
341 return work[k];
342 } else if (k < pivot) {
343 // the element is in the left partition
344 end = pivot;
345 node = FastMath.min(2 * node + 1, pivotsHeap.length); // the min is here to avoid integer overflow
346 } else {
347 // the element is in the right partition
348 begin = pivot + 1;
349 node = FastMath.min(2 * node + 2, pivotsHeap.length); // the min is here to avoid integer overflow
350 }
351
352 }
353
354 // the element is somewhere in the small sub-array
355 // sort the sub-array using insertion sort
356 insertionSort(work, begin, end);
357 return work[k];
358
359 }
360
361 /** Select a pivot index as the median of three
362 * @param work data array
363 * @param begin index of the first element of the slice
364 * @param end index after the last element of the slice
365 * @return the index of the median element chosen between the
366 * first, the middle and the last element of the array slice
367 */
368 int medianOf3(final double[] work, final int begin, final int end) {
369
370 final int inclusiveEnd = end - 1;
371 final int middle = begin + (inclusiveEnd - begin) / 2;
372 final double wBegin = work[begin];
373 final double wMiddle = work[middle];
374 final double wEnd = work[inclusiveEnd];
375
376 if (wBegin < wMiddle) {
377 if (wMiddle < wEnd) {
378 return middle;
379 } else {
380 return (wBegin < wEnd) ? inclusiveEnd : begin;
381 }
382 } else {
383 if (wBegin < wEnd) {
384 return begin;
385 } else {
386 return (wMiddle < wEnd) ? inclusiveEnd : middle;
387 }
388 }
389
390 }
391
392 /**
393 * Partition an array slice around a pivot
394 * <p>
395 * Partitioning exchanges array elements such that all elements
396 * smaller than pivot are before it and all elements larger than
397 * pivot are after it
398 * </p>
399 * @param work data array
400 * @param begin index of the first element of the slice
401 * @param end index after the last element of the slice
402 * @param pivot initial index of the pivot
403 * @return index of the pivot after partition
404 */
405 private int partition(final double[] work, final int begin, final int end, final int pivot) {
406
407 final double value = work[pivot];
408 work[pivot] = work[begin];
409
410 int i = begin + 1;
411 int j = end - 1;
412 while (i < j) {
413 while ((i < j) && (work[j] > value)) {
414 --j;
415 }
416 while ((i < j) && (work[i] < value)) {
417 ++i;
418 }
419
420 if (i < j) {
421 final double tmp = work[i];
422 work[i++] = work[j];
423 work[j--] = tmp;
424 }
425 }
426
427 if ((i >= end) || (work[i] > value)) {
428 --i;
429 }
430 work[begin] = work[i];
431 work[i] = value;
432 return i;
433
434 }
435
436 /**
437 * Sort in place a (small) array slice using insertion sort
438 * @param work array to sort
439 * @param begin index of the first element of the slice to sort
440 * @param end index after the last element of the slice to sort
441 */
442 private void insertionSort(final double[] work, final int begin, final int end) {
443 for (int j = begin + 1; j < end; j++) {
444 final double saved = work[j];
445 int i = j - 1;
446 while ((i >= begin) && (saved < work[i])) {
447 work[i + 1] = work[i];
448 i--;
449 }
450 work[i + 1] = saved;
451 }
452 }
453
454 /**
455 * Returns the value of the quantile field (determines what percentile is
456 * computed when evaluate() is called with no quantile argument).
457 *
458 * @return quantile
459 */
460 public double getQuantile() {
461 return quantile;
462 }
463
464 /**
465 * Sets the value of the quantile field (determines what percentile is
466 * computed when evaluate() is called with no quantile argument).
467 *
468 * @param p a value between 0 < p <= 100
469 * @throws MathIllegalArgumentException if p is not greater than 0 and less
470 * than or equal to 100
471 */
472 public void setQuantile(final double p) throws MathIllegalArgumentException {
473 if (p <= 0 || p > 100) {
474 throw new OutOfRangeException(
475 LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p, 0, 100);
476 }
477 quantile = p;
478 }
479
480 /**
481 * {@inheritDoc}
482 */
483 @Override
484 public Percentile copy() {
485 Percentile result = new Percentile();
486 //No try-catch or advertised exception because args are guaranteed non-null
487 copy(this, result);
488 return result;
489 }
490
491 /**
492 * Copies source to dest.
493 * <p>Neither source nor dest can be null.</p>
494 *
495 * @param source Percentile to copy
496 * @param dest Percentile to copy to
497 * @throws NullArgumentException if either source or dest is null
498 */
499 public static void copy(Percentile source, Percentile dest)
500 throws NullArgumentException {
501 MathUtils.checkNotNull(source);
502 MathUtils.checkNotNull(dest);
503 dest.setData(source.getDataRef());
504 if (source.cachedPivots != null) {
505 System.arraycopy(source.cachedPivots, 0, dest.cachedPivots, 0, source.cachedPivots.length);
506 }
507 dest.quantile = source.quantile;
508 }
509
510 }