/*
 * Copyright (c) 2003, The JUNG Authors
 *
 * All rights reserved.
 *
 * This software is open-source under the BSD license; see either
 * "license.txt" or
 * https://github.com/jrtom/jung/blob/master/LICENSE for a description.
 *
 * Created on Feb 18, 2004
 */
12  package edu.uci.ics.jung.algorithms.util;
13  
14  import java.util.Collection;
15  import java.util.Iterator;
16  
17  import com.google.common.base.Preconditions;
18  
/**
 * A utility class for calculating properties of discrete distributions.
 * Generally, these distributions are represented as arrays of
 * <code>double</code> values, which are assumed to be normalized
 * such that the entries in a single array sum to 1.
 *
 * @author Joshua O'Madadhain
 */
public class DiscreteDistribution
{
    /**
     * This class consists solely of static methods and is not meant to be
     * instantiated.
     */
    private DiscreteDistribution() {}

    /**
     * Verifies that the two distributions have the same number of elements.
     * Replaces the former Guava {@code Preconditions.checkArgument} calls so
     * that this class depends only on the JDK; the exception type and message
     * are unchanged.
     *
     * @param dist the first distribution
     * @param reference the second distribution
     * @throws IllegalArgumentException if the arrays differ in length
     */
    private static void checkLengths(double[] dist, double[] reference)
    {
        if (dist.length != reference.length)
            throw new IllegalArgumentException("input arrays must be of the same length");
    }

    /**
     * Returns the Kullback-Leibler divergence between the
     * two specified distributions, which must have the same
     * number of elements.  This is defined as
     * the sum over all <code>i</code> of
     * <code>dist[i] * Math.log(dist[i] / reference[i])</code>.
     * Note that this value is not symmetric; see
     * <code>symmetricKL</code> for a symmetric variant.
     * <p>
     * Terms for which either <code>dist[i]</code> or <code>reference[i]</code>
     * is non-positive are skipped (treating <code>0 * log 0</code> as 0;
     * note that strict KL divergence would be infinite when
     * <code>reference[i] == 0</code> but <code>dist[i] &gt; 0</code>).
     * @see #symmetricKL(double[], double[])
     * @param dist the distribution whose divergence from {@code reference} is being measured
     * @param reference the reference distribution
     * @return sum_i of {@code dist[i] * Math.log(dist[i] / reference[i])}
     * @throws IllegalArgumentException if the arrays differ in length
     */
    public static double KullbackLeibler(double[] dist, double[] reference)
    {
        checkLengths(dist, reference);

        double divergence = 0;
        for (int i = 0; i < dist.length; i++)
        {
            if (dist[i] > 0 && reference[i] > 0)
                divergence += dist[i] * Math.log(dist[i] / reference[i]);
        }
        return divergence;
    }

    /**
     * Returns the symmetrized Kullback-Leibler divergence: the sum of the
     * divergence in each direction.
     * @param dist the distribution whose divergence from {@code reference} is being measured
     * @param reference the reference distribution
     * @return <code>KullbackLeibler(dist, reference) + KullbackLeibler(reference, dist)</code>
     * @throws IllegalArgumentException if the arrays differ in length
     * @see #KullbackLeibler(double[], double[])
     */
    public static double symmetricKL(double[] dist, double[] reference)
    {
        return KullbackLeibler(dist, reference)
                + KullbackLeibler(reference, dist);
    }

    /**
     * Returns the squared difference between the
     * two specified distributions, which must have the same
     * number of elements.  This is defined as
     * the sum over all <code>i</code> of the square of
     * <code>(dist[i] - reference[i])</code>.
     * @param dist the distribution whose distance from {@code reference} is being measured
     * @param reference the reference distribution
     * @return sum_i {@code (dist[i] - reference[i])^2}
     * @throws IllegalArgumentException if the arrays differ in length
     */
    public static double squaredError(double[] dist, double[] reference)
    {
        checkLengths(dist, reference);

        double error = 0;
        for (int i = 0; i < dist.length; i++)
        {
            double difference = dist[i] - reference[i];
            error += difference * difference;
        }
        return error;
    }

    /**
     * Returns the cosine distance between the two
     * specified distributions, which must have the same number
     * of elements.  The distributions are treated as vectors
     * in <code>dist.length</code>-dimensional space.
     * Given the following definitions
     * <ul>
     * <li><code>v</code> = the sum over all <code>i</code> of <code>dist[i] * dist[i]</code>
     * <li><code>w</code> = the sum over all <code>i</code> of <code>reference[i] * reference[i]</code>
     * <li><code>vw</code> = the sum over all <code>i</code> of <code>dist[i] * reference[i]</code>
     * </ul>
     * the value returned is defined as <code>vw / (Math.sqrt(v) * Math.sqrt(w))</code>.
     * <p>
     * NOTE(review): despite the name, this formula is the cosine
     * <em>similarity</em> (1 for identical directions, 0 for orthogonal
     * vectors); the historical name is retained for compatibility.
     * @param dist the distribution whose distance from {@code reference} is being measured
     * @param reference the reference distribution
     * @return the cosine of the angle between {@code dist} and {@code reference}, considered as vectors
     * @throws IllegalArgumentException if the arrays differ in length
     */
    public static double cosine(double[] dist, double[] reference)
    {
        checkLengths(dist, reference);

        double v_prod = 0; // dot product x*x
        double w_prod = 0; // dot product y*y
        double vw_prod = 0; // dot product x*y

        for (int i = 0; i < dist.length; i++)
        {
            vw_prod += dist[i] * reference[i];
            v_prod += dist[i] * dist[i];
            w_prod += reference[i] * reference[i];
        }
        // cosine of the angle between v and w
        return vw_prod / (Math.sqrt(v_prod) * Math.sqrt(w_prod));
    }

    /**
     * Returns the entropy of this distribution.
     * High entropy indicates that the distribution is
     * close to uniform; low entropy indicates that the
     * distribution is close to a Dirac delta (i.e., if
     * the probability mass is concentrated at a single
     * point, this method returns 0).  Entropy is defined as
     * the sum over all <code>i</code> of
     * <code>-(dist[i] * Math.log(dist[i]))</code>
     *
     * @param dist the distribution whose entropy is being measured
     * @return sum_i {@code -(dist[i] * Math.log(dist[i]))}
     */
    public static double entropy(double[] dist)
    {
        double total = 0;

        for (int i = 0; i < dist.length; i++)
        {
            // skip zero entries: 0 * log 0 is taken to be 0
            if (dist[i] > 0)
                total += dist[i] * Math.log(dist[i]);
        }
        return -total;
    }

    /**
     * Normalizes, with Lagrangian smoothing, the specified <code>double</code>
     * array, so that the values sum to 1 (i.e., can be treated as probabilities).
     * The effect of the Lagrangian smoothing is to ensure that all entries
     * are nonzero; effectively, a value of <code>alpha</code> is added to each
     * entry in the original array prior to normalization.
     * @param counts the array to be converted into a probability distribution
     *        (modified in place)
     * @param alpha the value to add to each entry prior to normalization
     * @throws IllegalArgumentException if the total mass (original sum plus
     *         {@code counts.length * alpha}) is zero, which would otherwise
     *         silently fill the array with NaN or infinite values
     */
    public static void normalize(double[] counts, double alpha)
    {
        double total_count = 0;

        for (int i = 0; i < counts.length; i++)
            total_count += counts[i];

        double denominator = total_count + counts.length * alpha;
        // Fail fast rather than dividing by zero and producing NaN/Infinity.
        if (denominator == 0)
            throw new IllegalArgumentException(
                    "total count plus smoothing mass must be nonzero");

        for (int i = 0; i < counts.length; i++)
            counts[i] = (counts[i] + alpha) / denominator;
    }

    /**
     * Returns the mean of the specified <code>Collection</code> of
     * distributions, which are assumed to be normalized arrays of
     * <code>double</code> values.
     * @see #mean(double[][])
     * @param distributions the distributions whose mean is to be calculated
     * @return the mean of the distributions
     * @throws IllegalArgumentException if {@code distributions} is empty
     */
    public static double[] mean(Collection<double[]> distributions)
    {
        if (distributions.isEmpty())
            throw new IllegalArgumentException("Distribution collection must be non-empty");
        // Snapshot into an array and delegate to the array-based overload.
        return mean(distributions.toArray(new double[0][]));
    }

    /**
     * Returns the mean of the specified array of distributions,
     * represented as normalized arrays of <code>double</code> values.
     * Will throw an "index out of bounds" exception if the
     * distribution arrays are not all of the same length.
     * @param distributions the distributions whose mean is to be calculated
     * @return the mean of the distributions
     * @throws IllegalArgumentException if {@code distributions} is empty
     */
    public static double[] mean(double[][] distributions)
    {
        if (distributions.length == 0)
            throw new IllegalArgumentException("Distribution array must be non-empty");

        // Java zero-initializes the new array; no explicit clearing needed.
        double[] d_mean = new double[distributions[0].length];

        for (double[] distribution : distributions)
            for (int j = 0; j < d_mean.length; j++)
                d_mean[j] += distribution[j] / distributions.length;

        return d_mean;
    }

}
219 }