View Javadoc
1   /*
2    * Copyright (c) 2004, The JUNG Authors
3    *
4    * All rights reserved.
5    *
6    * This software is open-source under the BSD license; see either
7    * "license.txt" or
8    * https://github.com/jrtom/jung/blob/master/LICENSE for a description.
9    *
10   * Created on Aug 12, 2004
11   */
12  package edu.uci.ics.jung.algorithms.cluster;
13  
14  import edu.uci.ics.jung.algorithms.scoring.VoltageScorer;
15  import edu.uci.ics.jung.algorithms.util.DiscreteDistribution;
16  import edu.uci.ics.jung.algorithms.util.KMeansClusterer;
17  import edu.uci.ics.jung.algorithms.util.KMeansClusterer.NotEnoughClustersException;
18  import edu.uci.ics.jung.graph.Graph;
19  
20  import java.util.ArrayList;
21  import java.util.Collection;
22  import java.util.Collections;
23  import java.util.Comparator;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.LinkedList;
28  import java.util.List;
29  import java.util.Map;
30  import java.util.Random;
31  import java.util.Set;
32  
33  /**
34   * <p>Clusters vertices of a <code>Graph</code> based on their ranks as
35   * calculated by <code>VoltageScorer</code>.  This algorithm is based on,
36   * but not identical with, the method described in the paper below.
37   * The primary difference is that Wu and Huberman assume a priori that the clusters
38   * are of approximately the same size, and therefore use a more complex
39   * method than k-means (which is used here) for determining cluster
40   * membership based on co-occurrence data.
41   *
42   * <p>The algorithm proceeds as follows:
43   * <ul>
44   * <li>first, generate a set of candidate clusters as follows:
45   *      <ul>
46   *      <li>pick (widely separated) vertex pair, run VoltageScorer
47   *      <li>group the vertices in two clusters according to their voltages
48   *      <li>store resulting candidate clusters
49   *      </ul>
50   * <li>second, generate k-1 clusters as follows:
51   *      <ul>
52   *      <li>pick a vertex v as a cluster 'seed'
53   *           <br>(Wu/Huberman: most frequent vertex in candidate clusters)
54   *      <li>calculate co-occurrence over all candidate clusters of v with each other
55   *           vertex
56   *      <li>separate co-occurrence counts into high/low;
57   *           high vertices constitute a cluster
58   *      <li>remove v's vertices from candidate clusters; continue
59   *      </ul>
60   * <li>finally, remaining unassigned vertices are assigned to the kth ("garbage")
61   * cluster.
62   * </ul>
63   *
64   * <p><b>NOTE</b>: Depending on how the co-occurrence data splits the data into
65   * clusters, the number of clusters returned by this algorithm may be less than the
66   * number of clusters requested.  The number of clusters will never be more than
67   * the number requested, however.
68   *
69   * @author Joshua O'Madadhain
70   * @see "'Finding communities in linear time: a physics approach', Fang Wu and Bernardo Huberman, http://www.hpl.hp.com/research/idl/papers/linear/"
71   * @see VoltageScorer
72   * @see KMeansClusterer
73   */
74  public class VoltageClusterer<V,E>
75  {
76      protected int num_candidates;
77      protected KMeansClusterer<V> kmc;
78      protected Random rand;
79      protected Graph<V,E> g;
80  
81      /**
82       * Creates an instance of a VoltageCluster with the specified parameters.
83       * These are mostly parameters that are passed directly to VoltageScorer
84       * and KMeansClusterer.
85       * 
86       * @param g the graph whose vertices are to be clustered
87       * @param num_candidates    the number of candidate clusters to create
88       */
89      public VoltageClusterer(Graph<V,E> g, int num_candidates)
90      {
91          if (num_candidates < 1)
92              throw new IllegalArgumentException("must generate >=1 candidates");
93  
94          this.num_candidates = num_candidates;
95          this.kmc = new KMeansClusterer<V>();
96          rand = new Random();
97          this.g = g;
98      }
99  
100     protected void setRandomSeed(int random_seed)
101     {
102         rand = new Random(random_seed);
103     }
104 
105     /**
106      * @param v the vertex whose community we wish to discover
107      * @return a community (cluster) centered around <code>v</code>.
108      */
109     public Collection<Set<V>> getCommunity(V v)
110     {
111         return cluster_internal(v, 2);
112     }
113 
114     /**
115      * Clusters the vertices of <code>g</code> into
116      * <code>num_clusters</code> clusters, based on their connectivity.
117      * @param num_clusters  the number of clusters to identify
118      * @return a collection of clusters (sets of vertices)
119      */
120     public Collection<Set<V>> cluster(int num_clusters)
121     {
122         return cluster_internal(null, num_clusters);
123     }
124 
125     /**
126      * Does the work of <code>getCommunity</code> and <code>cluster</code>.
127      * @param origin the vertex around which clustering is to be done
128      * @param num_clusters the (maximum) number of clusters to find
129      * @return a collection of clusters (sets of vertices)
130      */
131     protected Collection<Set<V>> cluster_internal(V origin, int num_clusters)
132     {
133         // generate candidate clusters
134         // repeat the following 'samples' times:
135         // * pick (widely separated) vertex pair, run VoltageScorer
136         // * use k-means to identify 2 communities in ranked graph
137         // * store resulting candidate communities
138         ArrayList<V> v_array = new ArrayList<V>(g.getVertices());
139 
140         LinkedList<Set<V>> candidates = new LinkedList<Set<V>>();
141 
142         for (int j = 0; j < num_candidates; j++)
143         {
144             V source;
145             if (origin == null)
146                 source = v_array.get((int)(rand.nextDouble() * v_array.size()));
147             else
148                 source = origin;
149             V target = null;
150             do
151             {
152                 target = v_array.get((int)(rand.nextDouble() * v_array.size()));
153             }
154             while (source == target);
155             VoltageScorer<V,E> vs = new VoltageScorer<V,E>(g, source, target);
156             vs.evaluate();
157 
158             Map<V, double[]> voltage_ranks = new HashMap<V, double[]>();
159             for (V v : g.getVertices())
160                 voltage_ranks.put(v, new double[] {vs.getVertexScore(v)});
161 
162 //            addOneCandidateCluster(candidates, voltage_ranks);
163             addTwoCandidateClusters(candidates, voltage_ranks);
164         }
165 
166         // repeat the following k-1 times:
167         // * pick a vertex v as a cluster seed
168         //   (Wu/Huberman: most frequent vertex in candidates)
169         // * calculate co-occurrence (in candidate clusters)
170         //   of this vertex with all others
171         // * use k-means to separate co-occurrence counts into high/low;
172         //   high vertices are a cluster
173         // * remove v's vertices from candidate clusters
174 
175         Collection<Set<V>> clusters = new LinkedList<Set<V>>();
176         Set<V> remaining = new HashSet<V>(g.getVertices());
177 
178         List<V> seed_candidates = getSeedCandidates(candidates);
179         int seed_index = 0;
180 
181         for (int j = 0; j < (num_clusters - 1); j++)
182         {
183             if (remaining.isEmpty())
184                 break;
185 
186             V seed;
187             if (seed_index == 0 && origin != null)
188                 seed = origin;
189             else
190             {
191                 do { seed = seed_candidates.get(seed_index++); }
192                 while (!remaining.contains(seed));
193             }
194 
195             Map<V, double[]> occur_counts = getObjectCounts(candidates, seed);
196             if (occur_counts.size() < 2)
197                 break;
198 
199             // now that we have the counts, cluster them...
200             try
201             {
202                 Collection<Map<V, double[]>> high_low = kmc.cluster(occur_counts, 2);
203                 // ...get the cluster with the highest-valued centroid...
204                 Iterator<Map<V, double[]>> h_iter = high_low.iterator();
205                 Map<V, double[]> cluster1 = h_iter.next();
206                 Map<V, double[]> cluster2 = h_iter.next();
207                 double[] centroid1 = DiscreteDistribution.mean(cluster1.values());
208                 double[] centroid2 = DiscreteDistribution.mean(cluster2.values());
209                 Set<V> new_cluster;
210                 if (centroid1[0] >= centroid2[0])
211                     new_cluster = cluster1.keySet();
212                 else
213                     new_cluster = cluster2.keySet();
214 
215                 // ...remove the elements of new_cluster from each candidate...
216                 for (Set<V> cluster : candidates)
217                     cluster.removeAll(new_cluster);
218                 clusters.add(new_cluster);
219                 remaining.removeAll(new_cluster);
220             }
221             catch (NotEnoughClustersException nece)
222             {
223                 // all remaining vertices are in the same cluster
224                 break;
225             }
226         }
227 
228         // identify remaining vertices (if any) as a 'garbage' cluster
229         if (!remaining.isEmpty())
230             clusters.add(remaining);
231 
232         return clusters;
233     }
234 
235     /**
236      * Do k-means with three intervals and pick the smaller two clusters 
237      * (presumed to be on the ends); this is closer to the Wu-Huberman method.
238      * @param candidates the list of clusters to populate
239      * @param voltage_ranks the voltage values for each vertex
240      */
241     protected void addTwoCandidateClusters(LinkedList<Set<V>> candidates,
242             Map<V, double[]> voltage_ranks)
243     {
244         try
245         {
246             List<Map<V, double[]>> clusters = new ArrayList<Map<V, double[]>>(kmc.cluster(voltage_ranks, 3));
247             boolean b01 = clusters.get(0).size() > clusters.get(1).size();
248             boolean b02 = clusters.get(0).size() > clusters.get(2).size();
249             boolean b12 = clusters.get(1).size() > clusters.get(2).size();
250             if (b01 && b02)
251             {
252                 candidates.add(clusters.get(1).keySet());
253                 candidates.add(clusters.get(2).keySet());
254             }
255             else if (!b01 && b12)
256             {
257                 candidates.add(clusters.get(0).keySet());
258                 candidates.add(clusters.get(2).keySet());
259             }
260             else if (!b02 && !b12)
261             {
262                 candidates.add(clusters.get(0).keySet());
263                 candidates.add(clusters.get(1).keySet());
264             }
265         }
266         catch (NotEnoughClustersException e)
267         {
268             // no valid candidates, continue
269         }
270     }
271 
272     /**
273      * alternative to addTwoCandidateClusters(): cluster vertices by voltages into 2 clusters.
274      * We only consider the smaller of the two clusters returned
275      * by k-means to be a 'true' cluster candidate; the other is a garbage cluster.
276      * @param candidates the list of clusters to populate
277      * @param voltage_ranks the voltage values for each vertex
278      */
279     protected void addOneCandidateCluster(LinkedList<Set<V>> candidates,
280             Map<V, double[]> voltage_ranks)
281     {
282         try
283         {
284             List<Map<V, double[]>> clusters;
285             clusters = new ArrayList<Map<V, double[]>>(kmc.cluster(voltage_ranks, 2));
286             if (clusters.get(0).size() < clusters.get(1).size())
287                 candidates.add(clusters.get(0).keySet());
288             else
289                 candidates.add(clusters.get(1).keySet());
290         }
291         catch (NotEnoughClustersException e)
292         {
293             // no valid candidates, continue
294         }
295     }
296 
297     /**
298      * Returns a list of cluster seeds, ranked in decreasing order
299      * of number of appearances in the specified collection of candidate
300      * clusters.
301      * @param candidates the set of candidate clusters
302      * @return a set of cluster seeds
303      */
304     protected List<V> getSeedCandidates(Collection<Set<V>> candidates)
305     {
306         final Map<V, double[]> occur_counts = getObjectCounts(candidates, null);
307 
308         ArrayList<V> occurrences = new ArrayList<V>(occur_counts.keySet());
309         Collections.sort(occurrences, new MapValueArrayComparator(occur_counts));
310 
311 //        System.out.println("occurrences: ");
312         for (int i = 0; i < occurrences.size(); i++)
313             System.out.println(occur_counts.get(occurrences.get(i))[0]);
314 
315         return occurrences;
316     }
317 
318     protected Map<V, double[]> getObjectCounts(Collection<Set<V>> candidates, V seed)
319     {
320         Map<V, double[]> occur_counts = new HashMap<V, double[]>();
321         for (V v : g.getVertices())
322             occur_counts.put(v, new double[]{0});
323 
324         for (Set<V> candidate : candidates)
325         {
326             if (seed == null)
327                 System.out.println(candidate.size());
328             if (seed == null || candidate.contains(seed))
329             {
330                 for (V element : candidate)
331                 {
332                     double[] count = occur_counts.get(element);
333                     count[0]++;
334                 }
335             }
336         }
337 
338         if (seed == null)
339         {
340             System.out.println("occur_counts size: " + occur_counts.size());
341             for (V v : occur_counts.keySet())
342                 System.out.println(occur_counts.get(v)[0]);
343         }
344 
345         return occur_counts;
346     }
347 
348     protected class MapValueArrayComparator implements Comparator<V>
349     {
350         private Map<V, double[]> map;
351 
352         protected MapValueArrayComparator(Map<V, double[]> map)
353         {
354             this.map = map;
355         }
356 
357         public int compare(V o1, V o2)
358         {
359             double[] count0 = map.get(o1);
360             double[] count1 = map.get(o2);
361             if (count0[0] < count1[0])
362                 return 1;
363             else if (count0[0] > count1[0])
364                 return -1;
365             return 0;
366         }
367 
368     }
369 
370 }