001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 package org.apache.commons.math3.distribution;
019
020 import org.apache.commons.math3.exception.NotStrictlyPositiveException;
021 import org.apache.commons.math3.exception.util.LocalizedFormats;
022 import org.apache.commons.math3.util.FastMath;
023 import org.apache.commons.math3.random.RandomGenerator;
024 import org.apache.commons.math3.random.Well19937c;
025
026 /**
027 * Implementation of the Zipf distribution.
028 *
029 * @see <a href="http://mathworld.wolfram.com/ZipfDistribution.html">Zipf distribution (MathWorld)</a>
030 * @version $Id: ZipfDistribution.java 1416643 2012-12-03 19:37:14Z tn $
031 */
032 public class ZipfDistribution extends AbstractIntegerDistribution {
033 /** Serializable version identifier. */
034 private static final long serialVersionUID = -140627372283420404L;
035 /** Number of elements. */
036 private final int numberOfElements;
037 /** Exponent parameter of the distribution. */
038 private final double exponent;
039 /** Cached numerical mean */
040 private double numericalMean = Double.NaN;
041 /** Whether or not the numerical mean has been calculated */
042 private boolean numericalMeanIsCalculated = false;
043 /** Cached numerical variance */
044 private double numericalVariance = Double.NaN;
045 /** Whether or not the numerical variance has been calculated */
046 private boolean numericalVarianceIsCalculated = false;
047
048 /**
049 * Create a new Zipf distribution with the given number of elements and
050 * exponent.
051 *
052 * @param numberOfElements Number of elements.
053 * @param exponent Exponent.
054 * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
055 * or {@code exponent <= 0}.
056 */
057 public ZipfDistribution(final int numberOfElements, final double exponent) {
058 this(new Well19937c(), numberOfElements, exponent);
059 }
060
061 /**
062 * Creates a Zipf distribution.
063 *
064 * @param rng Random number generator.
065 * @param numberOfElements Number of elements.
066 * @param exponent Exponent.
067 * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
068 * or {@code exponent <= 0}.
069 * @since 3.1
070 */
071 public ZipfDistribution(RandomGenerator rng,
072 int numberOfElements,
073 double exponent)
074 throws NotStrictlyPositiveException {
075 super(rng);
076
077 if (numberOfElements <= 0) {
078 throw new NotStrictlyPositiveException(LocalizedFormats.DIMENSION,
079 numberOfElements);
080 }
081 if (exponent <= 0) {
082 throw new NotStrictlyPositiveException(LocalizedFormats.EXPONENT,
083 exponent);
084 }
085
086 this.numberOfElements = numberOfElements;
087 this.exponent = exponent;
088 }
089
090 /**
091 * Get the number of elements (e.g. corpus size) for the distribution.
092 *
093 * @return the number of elements
094 */
095 public int getNumberOfElements() {
096 return numberOfElements;
097 }
098
099 /**
100 * Get the exponent characterizing the distribution.
101 *
102 * @return the exponent
103 */
104 public double getExponent() {
105 return exponent;
106 }
107
108 /** {@inheritDoc} */
109 public double probability(final int x) {
110 if (x <= 0 || x > numberOfElements) {
111 return 0.0;
112 }
113
114 return (1.0 / FastMath.pow(x, exponent)) / generalizedHarmonic(numberOfElements, exponent);
115 }
116
117 /** {@inheritDoc} */
118 public double cumulativeProbability(final int x) {
119 if (x <= 0) {
120 return 0.0;
121 } else if (x >= numberOfElements) {
122 return 1.0;
123 }
124
125 return generalizedHarmonic(x, exponent) / generalizedHarmonic(numberOfElements, exponent);
126 }
127
128 /**
129 * {@inheritDoc}
130 *
131 * For number of elements {@code N} and exponent {@code s}, the mean is
132 * {@code Hs1 / Hs}, where
133 * <ul>
134 * <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
135 * <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
136 * </ul>
137 */
138 public double getNumericalMean() {
139 if (!numericalMeanIsCalculated) {
140 numericalMean = calculateNumericalMean();
141 numericalMeanIsCalculated = true;
142 }
143 return numericalMean;
144 }
145
146 /**
147 * Used by {@link #getNumericalMean()}.
148 *
149 * @return the mean of this distribution
150 */
151 protected double calculateNumericalMean() {
152 final int N = getNumberOfElements();
153 final double s = getExponent();
154
155 final double Hs1 = generalizedHarmonic(N, s - 1);
156 final double Hs = generalizedHarmonic(N, s);
157
158 return Hs1 / Hs;
159 }
160
161 /**
162 * {@inheritDoc}
163 *
164 * For number of elements {@code N} and exponent {@code s}, the mean is
165 * {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where
166 * <ul>
167 * <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li>
168 * <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
169 * <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
170 * </ul>
171 */
172 public double getNumericalVariance() {
173 if (!numericalVarianceIsCalculated) {
174 numericalVariance = calculateNumericalVariance();
175 numericalVarianceIsCalculated = true;
176 }
177 return numericalVariance;
178 }
179
180 /**
181 * Used by {@link #getNumericalVariance()}.
182 *
183 * @return the variance of this distribution
184 */
185 protected double calculateNumericalVariance() {
186 final int N = getNumberOfElements();
187 final double s = getExponent();
188
189 final double Hs2 = generalizedHarmonic(N, s - 2);
190 final double Hs1 = generalizedHarmonic(N, s - 1);
191 final double Hs = generalizedHarmonic(N, s);
192
193 return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
194 }
195
196 /**
197 * Calculates the Nth generalized harmonic number. See
198 * <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic
199 * Series</a>.
200 *
201 * @param n Term in the series to calculate (must be larger than 1)
202 * @param m Exponent (special case {@code m = 1} is the harmonic series).
203 * @return the n<sup>th</sup> generalized harmonic number.
204 */
205 private double generalizedHarmonic(final int n, final double m) {
206 double value = 0;
207 for (int k = n; k > 0; --k) {
208 value += 1.0 / FastMath.pow(k, m);
209 }
210 return value;
211 }
212
213 /**
214 * {@inheritDoc}
215 *
216 * The lower bound of the support is always 1 no matter the parameters.
217 *
218 * @return lower bound of the support (always 1)
219 */
220 public int getSupportLowerBound() {
221 return 1;
222 }
223
224 /**
225 * {@inheritDoc}
226 *
227 * The upper bound of the support is the number of elements.
228 *
229 * @return upper bound of the support
230 */
231 public int getSupportUpperBound() {
232 return getNumberOfElements();
233 }
234
235 /**
236 * {@inheritDoc}
237 *
238 * The support of this distribution is connected.
239 *
240 * @return {@code true}
241 */
242 public boolean isSupportConnected() {
243 return true;
244 }
245 }
246