001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.math3.stat.clustering;
018
019 import java.util.ArrayList;
020 import java.util.Collection;
021 import java.util.HashMap;
022 import java.util.HashSet;
023 import java.util.List;
024 import java.util.Map;
025 import java.util.Set;
026
027 import org.apache.commons.math3.exception.NotPositiveException;
028 import org.apache.commons.math3.exception.NullArgumentException;
029 import org.apache.commons.math3.util.MathUtils;
030
031 /**
032 * DBSCAN (density-based spatial clustering of applications with noise) algorithm.
033 * <p>
034 * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e.
035 * a point p is density connected to another point q, if there exists a chain of
036 * points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q,
037 * such that each pair <p<sub>i</sub>, p<sub>i+1</sub>> is directly density-reachable.
038 * A point q is directly density-reachable from point p if it is in the ε-neighborhood
039 * of this point.
040 * <p>
041 * Any point that is not density-reachable from a formed cluster is treated as noise, and
042 * will thus not be present in the result.
043 * <p>
044 * The algorithm requires two parameters:
045 * <ul>
046 * <li>eps: the distance that defines the ε-neighborhood of a point
047 * <li>minPoints: the minimum number of density-connected points required to form a cluster
048 * </ul>
049 * <p>
050 * <b>Note:</b> as DBSCAN is not a centroid-based clustering algorithm, the resulting
051 * {@link Cluster} objects will have no defined center, i.e. {@link Cluster#getCenter()} will
052 * return {@code null}.
053 *
054 * @param <T> type of the points to cluster
055 * @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a>
056 * @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf">
057 * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a>
058 * @version $Id: DBSCANClusterer.java 1410882 2012-11-18 12:49:49Z tn $
059 * @since 3.1
060 */
061 public class DBSCANClusterer<T extends Clusterable<T>> {
062
063 /** Maximum radius of the neighborhood to be considered. */
064 private final double eps;
065
066 /** Minimum number of points needed for a cluster. */
067 private final int minPts;
068
069 /** Status of a point during the clustering process. */
070 private enum PointStatus {
071 /** The point has is considered to be noise. */
072 NOISE,
073 /** The point is already part of a cluster. */
074 PART_OF_CLUSTER
075 }
076
077 /**
078 * Creates a new instance of a DBSCANClusterer.
079 *
080 * @param eps maximum radius of the neighborhood to be considered
081 * @param minPts minimum number of points needed for a cluster
082 * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0}
083 */
084 public DBSCANClusterer(final double eps, final int minPts)
085 throws NotPositiveException {
086 if (eps < 0.0d) {
087 throw new NotPositiveException(eps);
088 }
089 if (minPts < 0) {
090 throw new NotPositiveException(minPts);
091 }
092 this.eps = eps;
093 this.minPts = minPts;
094 }
095
096 /**
097 * Returns the maximum radius of the neighborhood to be considered.
098 *
099 * @return maximum radius of the neighborhood
100 */
101 public double getEps() {
102 return eps;
103 }
104
105 /**
106 * Returns the minimum number of points needed for a cluster.
107 *
108 * @return minimum number of points needed for a cluster
109 */
110 public int getMinPts() {
111 return minPts;
112 }
113
114 /**
115 * Performs DBSCAN cluster analysis.
116 * <p>
117 * <b>Note:</b> as DBSCAN is not a centroid-based clustering algorithm, the resulting
118 * {@link Cluster} objects will have no defined center, i.e. {@link Cluster#getCenter()} will
119 * return {@code null}.
120 *
121 * @param points the points to cluster
122 * @return the list of clusters
123 * @throws NullArgumentException if the data points are null
124 */
125 public List<Cluster<T>> cluster(final Collection<T> points) throws NullArgumentException {
126
127 // sanity checks
128 MathUtils.checkNotNull(points);
129
130 final List<Cluster<T>> clusters = new ArrayList<Cluster<T>>();
131 final Map<Clusterable<T>, PointStatus> visited = new HashMap<Clusterable<T>, PointStatus>();
132
133 for (final T point : points) {
134 if (visited.get(point) != null) {
135 continue;
136 }
137 final List<T> neighbors = getNeighbors(point, points);
138 if (neighbors.size() >= minPts) {
139 // DBSCAN does not care about center points
140 final Cluster<T> cluster = new Cluster<T>(null);
141 clusters.add(expandCluster(cluster, point, neighbors, points, visited));
142 } else {
143 visited.put(point, PointStatus.NOISE);
144 }
145 }
146
147 return clusters;
148 }
149
150 /**
151 * Expands the cluster to include density-reachable items.
152 *
153 * @param cluster Cluster to expand
154 * @param point Point to add to cluster
155 * @param neighbors List of neighbors
156 * @param points the data set
157 * @param visited the set of already visited points
158 * @return the expanded cluster
159 */
160 private Cluster<T> expandCluster(final Cluster<T> cluster,
161 final T point,
162 final List<T> neighbors,
163 final Collection<T> points,
164 final Map<Clusterable<T>, PointStatus> visited) {
165 cluster.addPoint(point);
166 visited.put(point, PointStatus.PART_OF_CLUSTER);
167
168 List<T> seeds = new ArrayList<T>(neighbors);
169 int index = 0;
170 while (index < seeds.size()) {
171 final T current = seeds.get(index);
172 PointStatus pStatus = visited.get(current);
173 // only check non-visited points
174 if (pStatus == null) {
175 final List<T> currentNeighbors = getNeighbors(current, points);
176 if (currentNeighbors.size() >= minPts) {
177 seeds = merge(seeds, currentNeighbors);
178 }
179 }
180
181 if (pStatus != PointStatus.PART_OF_CLUSTER) {
182 visited.put(current, PointStatus.PART_OF_CLUSTER);
183 cluster.addPoint(current);
184 }
185
186 index++;
187 }
188 return cluster;
189 }
190
191 /**
192 * Returns a list of density-reachable neighbors of a {@code point}.
193 *
194 * @param point the point to look for
195 * @param points possible neighbors
196 * @return the List of neighbors
197 */
198 private List<T> getNeighbors(final T point, final Collection<T> points) {
199 final List<T> neighbors = new ArrayList<T>();
200 for (final T neighbor : points) {
201 if (point != neighbor && neighbor.distanceFrom(point) <= eps) {
202 neighbors.add(neighbor);
203 }
204 }
205 return neighbors;
206 }
207
208 /**
209 * Merges two lists together.
210 *
211 * @param one first list
212 * @param two second list
213 * @return merged lists
214 */
215 private List<T> merge(final List<T> one, final List<T> two) {
216 final Set<T> oneSet = new HashSet<T>(one);
217 for (T item : two) {
218 if (!oneSet.contains(item)) {
219 one.add(item);
220 }
221 }
222 return one;
223 }
224 }