1 /*
2 * Copyright (c) 2004, The JUNG Authors
3 *
4 * All rights reserved.
5 *
6 * This software is open-source under the BSD license; see either
7 * "license.txt" or
8 * https://github.com/jrtom/jung/blob/master/LICENSE for a description.
9 *
10 * Created on Aug 12, 2004
11 */
12 package edu.uci.ics.jung.algorithms.cluster;
13
14 import edu.uci.ics.jung.algorithms.scoring.VoltageScorer;
15 import edu.uci.ics.jung.algorithms.util.DiscreteDistribution;
16 import edu.uci.ics.jung.algorithms.util.KMeansClusterer;
17 import edu.uci.ics.jung.algorithms.util.KMeansClusterer.NotEnoughClustersException;
18 import edu.uci.ics.jung.graph.Graph;
19
20 import java.util.ArrayList;
21 import java.util.Collection;
22 import java.util.Collections;
23 import java.util.Comparator;
24 import java.util.HashMap;
25 import java.util.HashSet;
26 import java.util.Iterator;
27 import java.util.LinkedList;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.Random;
31 import java.util.Set;
32
33 /**
34 * <p>Clusters vertices of a <code>Graph</code> based on their ranks as
35 * calculated by <code>VoltageScorer</code>. This algorithm is based on,
36 * but not identical with, the method described in the paper below.
37 * The primary difference is that Wu and Huberman assume a priori that the clusters
38 * are of approximately the same size, and therefore use a more complex
39 * method than k-means (which is used here) for determining cluster
40 * membership based on co-occurrence data.
41 *
42 * <p>The algorithm proceeds as follows:
43 * <ul>
44 * <li>first, generate a set of candidate clusters as follows:
45 * <ul>
46 * <li>pick (widely separated) vertex pair, run VoltageScorer
47 * <li>group the vertices in two clusters according to their voltages
48 * <li>store resulting candidate clusters
49 * </ul>
50 * <li>second, generate k-1 clusters as follows:
51 * <ul>
52 * <li>pick a vertex v as a cluster 'seed'
53 * <br>(Wu/Huberman: most frequent vertex in candidate clusters)
54 * <li>calculate co-occurrence over all candidate clusters of v with each other
55 * vertex
56 * <li>separate co-occurrence counts into high/low;
57 * high vertices constitute a cluster
58 * <li>remove v's vertices from candidate clusters; continue
59 * </ul>
60 * <li>finally, remaining unassigned vertices are assigned to the kth ("garbage")
61 * cluster.
62 * </ul>
63 *
64 * <p><b>NOTE</b>: Depending on how the co-occurrence data splits the data into
65 * clusters, the number of clusters returned by this algorithm may be less than the
66 * number of clusters requested. The number of clusters will never be more than
67 * the number requested, however.
68 *
69 * @author Joshua O'Madadhain
70 * @see "'Finding communities in linear time: a physics approach', Fang Wu and Bernardo Huberman, http://www.hpl.hp.com/research/idl/papers/linear/"
71 * @see VoltageScorer
72 * @see KMeansClusterer
73 */
74 public class VoltageClusterer<V,E>
75 {
76 protected int num_candidates;
77 protected KMeansClusterer<V> kmc;
78 protected Random rand;
79 protected Graph<V,E> g;
80
81 /**
82 * Creates an instance of a VoltageCluster with the specified parameters.
83 * These are mostly parameters that are passed directly to VoltageScorer
84 * and KMeansClusterer.
85 *
86 * @param g the graph whose vertices are to be clustered
87 * @param num_candidates the number of candidate clusters to create
88 */
89 public VoltageClusterer(Graph<V,E> g, int num_candidates)
90 {
91 if (num_candidates < 1)
92 throw new IllegalArgumentException("must generate >=1 candidates");
93
94 this.num_candidates = num_candidates;
95 this.kmc = new KMeansClusterer<V>();
96 rand = new Random();
97 this.g = g;
98 }
99
100 protected void setRandomSeed(int random_seed)
101 {
102 rand = new Random(random_seed);
103 }
104
105 /**
106 * @param v the vertex whose community we wish to discover
107 * @return a community (cluster) centered around <code>v</code>.
108 */
109 public Collection<Set<V>> getCommunity(V v)
110 {
111 return cluster_internal(v, 2);
112 }
113
114 /**
115 * Clusters the vertices of <code>g</code> into
116 * <code>num_clusters</code> clusters, based on their connectivity.
117 * @param num_clusters the number of clusters to identify
118 * @return a collection of clusters (sets of vertices)
119 */
120 public Collection<Set<V>> cluster(int num_clusters)
121 {
122 return cluster_internal(null, num_clusters);
123 }
124
125 /**
126 * Does the work of <code>getCommunity</code> and <code>cluster</code>.
127 * @param origin the vertex around which clustering is to be done
128 * @param num_clusters the (maximum) number of clusters to find
129 * @return a collection of clusters (sets of vertices)
130 */
131 protected Collection<Set<V>> cluster_internal(V origin, int num_clusters)
132 {
133 // generate candidate clusters
134 // repeat the following 'samples' times:
135 // * pick (widely separated) vertex pair, run VoltageScorer
136 // * use k-means to identify 2 communities in ranked graph
137 // * store resulting candidate communities
138 ArrayList<V> v_array = new ArrayList<V>(g.getVertices());
139
140 LinkedList<Set<V>> candidates = new LinkedList<Set<V>>();
141
142 for (int j = 0; j < num_candidates; j++)
143 {
144 V source;
145 if (origin == null)
146 source = v_array.get((int)(rand.nextDouble() * v_array.size()));
147 else
148 source = origin;
149 V target = null;
150 do
151 {
152 target = v_array.get((int)(rand.nextDouble() * v_array.size()));
153 }
154 while (source == target);
155 VoltageScorer<V,E> vs = new VoltageScorer<V,E>(g, source, target);
156 vs.evaluate();
157
158 Map<V, double[]> voltage_ranks = new HashMap<V, double[]>();
159 for (V v : g.getVertices())
160 voltage_ranks.put(v, new double[] {vs.getVertexScore(v)});
161
162 // addOneCandidateCluster(candidates, voltage_ranks);
163 addTwoCandidateClusters(candidates, voltage_ranks);
164 }
165
166 // repeat the following k-1 times:
167 // * pick a vertex v as a cluster seed
168 // (Wu/Huberman: most frequent vertex in candidates)
169 // * calculate co-occurrence (in candidate clusters)
170 // of this vertex with all others
171 // * use k-means to separate co-occurrence counts into high/low;
172 // high vertices are a cluster
173 // * remove v's vertices from candidate clusters
174
175 Collection<Set<V>> clusters = new LinkedList<Set<V>>();
176 Set<V> remaining = new HashSet<V>(g.getVertices());
177
178 List<V> seed_candidates = getSeedCandidates(candidates);
179 int seed_index = 0;
180
181 for (int j = 0; j < (num_clusters - 1); j++)
182 {
183 if (remaining.isEmpty())
184 break;
185
186 V seed;
187 if (seed_index == 0 && origin != null)
188 seed = origin;
189 else
190 {
191 do { seed = seed_candidates.get(seed_index++); }
192 while (!remaining.contains(seed));
193 }
194
195 Map<V, double[]> occur_counts = getObjectCounts(candidates, seed);
196 if (occur_counts.size() < 2)
197 break;
198
199 // now that we have the counts, cluster them...
200 try
201 {
202 Collection<Map<V, double[]>> high_low = kmc.cluster(occur_counts, 2);
203 // ...get the cluster with the highest-valued centroid...
204 Iterator<Map<V, double[]>> h_iter = high_low.iterator();
205 Map<V, double[]> cluster1 = h_iter.next();
206 Map<V, double[]> cluster2 = h_iter.next();
207 double[] centroid1 = DiscreteDistribution.mean(cluster1.values());
208 double[] centroid2 = DiscreteDistribution.mean(cluster2.values());
209 Set<V> new_cluster;
210 if (centroid1[0] >= centroid2[0])
211 new_cluster = cluster1.keySet();
212 else
213 new_cluster = cluster2.keySet();
214
215 // ...remove the elements of new_cluster from each candidate...
216 for (Set<V> cluster : candidates)
217 cluster.removeAll(new_cluster);
218 clusters.add(new_cluster);
219 remaining.removeAll(new_cluster);
220 }
221 catch (NotEnoughClustersException nece)
222 {
223 // all remaining vertices are in the same cluster
224 break;
225 }
226 }
227
228 // identify remaining vertices (if any) as a 'garbage' cluster
229 if (!remaining.isEmpty())
230 clusters.add(remaining);
231
232 return clusters;
233 }
234
235 /**
236 * Do k-means with three intervals and pick the smaller two clusters
237 * (presumed to be on the ends); this is closer to the Wu-Huberman method.
238 * @param candidates the list of clusters to populate
239 * @param voltage_ranks the voltage values for each vertex
240 */
241 protected void addTwoCandidateClusters(LinkedList<Set<V>> candidates,
242 Map<V, double[]> voltage_ranks)
243 {
244 try
245 {
246 List<Map<V, double[]>> clusters = new ArrayList<Map<V, double[]>>(kmc.cluster(voltage_ranks, 3));
247 boolean b01 = clusters.get(0).size() > clusters.get(1).size();
248 boolean b02 = clusters.get(0).size() > clusters.get(2).size();
249 boolean b12 = clusters.get(1).size() > clusters.get(2).size();
250 if (b01 && b02)
251 {
252 candidates.add(clusters.get(1).keySet());
253 candidates.add(clusters.get(2).keySet());
254 }
255 else if (!b01 && b12)
256 {
257 candidates.add(clusters.get(0).keySet());
258 candidates.add(clusters.get(2).keySet());
259 }
260 else if (!b02 && !b12)
261 {
262 candidates.add(clusters.get(0).keySet());
263 candidates.add(clusters.get(1).keySet());
264 }
265 }
266 catch (NotEnoughClustersException e)
267 {
268 // no valid candidates, continue
269 }
270 }
271
272 /**
273 * alternative to addTwoCandidateClusters(): cluster vertices by voltages into 2 clusters.
274 * We only consider the smaller of the two clusters returned
275 * by k-means to be a 'true' cluster candidate; the other is a garbage cluster.
276 * @param candidates the list of clusters to populate
277 * @param voltage_ranks the voltage values for each vertex
278 */
279 protected void addOneCandidateCluster(LinkedList<Set<V>> candidates,
280 Map<V, double[]> voltage_ranks)
281 {
282 try
283 {
284 List<Map<V, double[]>> clusters;
285 clusters = new ArrayList<Map<V, double[]>>(kmc.cluster(voltage_ranks, 2));
286 if (clusters.get(0).size() < clusters.get(1).size())
287 candidates.add(clusters.get(0).keySet());
288 else
289 candidates.add(clusters.get(1).keySet());
290 }
291 catch (NotEnoughClustersException e)
292 {
293 // no valid candidates, continue
294 }
295 }
296
297 /**
298 * Returns a list of cluster seeds, ranked in decreasing order
299 * of number of appearances in the specified collection of candidate
300 * clusters.
301 * @param candidates the set of candidate clusters
302 * @return a set of cluster seeds
303 */
304 protected List<V> getSeedCandidates(Collection<Set<V>> candidates)
305 {
306 final Map<V, double[]> occur_counts = getObjectCounts(candidates, null);
307
308 ArrayList<V> occurrences = new ArrayList<V>(occur_counts.keySet());
309 Collections.sort(occurrences, new MapValueArrayComparator(occur_counts));
310
311 // System.out.println("occurrences: ");
312 for (int i = 0; i < occurrences.size(); i++)
313 System.out.println(occur_counts.get(occurrences.get(i))[0]);
314
315 return occurrences;
316 }
317
318 protected Map<V, double[]> getObjectCounts(Collection<Set<V>> candidates, V seed)
319 {
320 Map<V, double[]> occur_counts = new HashMap<V, double[]>();
321 for (V v : g.getVertices())
322 occur_counts.put(v, new double[]{0});
323
324 for (Set<V> candidate : candidates)
325 {
326 if (seed == null)
327 System.out.println(candidate.size());
328 if (seed == null || candidate.contains(seed))
329 {
330 for (V element : candidate)
331 {
332 double[] count = occur_counts.get(element);
333 count[0]++;
334 }
335 }
336 }
337
338 if (seed == null)
339 {
340 System.out.println("occur_counts size: " + occur_counts.size());
341 for (V v : occur_counts.keySet())
342 System.out.println(occur_counts.get(v)[0]);
343 }
344
345 return occur_counts;
346 }
347
348 protected class MapValueArrayComparator implements Comparator<V>
349 {
350 private Map<V, double[]> map;
351
352 protected MapValueArrayComparator(Map<V, double[]> map)
353 {
354 this.map = map;
355 }
356
357 public int compare(V o1, V o2)
358 {
359 double[] count0 = map.get(o1);
360 double[] count1 = map.get(o2);
361 if (count0[0] < count1[0])
362 return 1;
363 else if (count0[0] > count1[0])
364 return -1;
365 return 0;
366 }
367
368 }
369
370 }