1dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond/* 2dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * Licensed to the Apache Software Foundation (ASF) under one or more 3dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * contributor license agreements. See the NOTICE file distributed with 4dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * this work for additional information regarding copyright ownership. 5dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * The ASF licenses this file to You under the Apache License, Version 2.0 6dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * (the "License"); you may not use this file except in compliance with 7dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * the License. You may obtain a copy of the License at 8dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * 9dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * http://www.apache.org/licenses/LICENSE-2.0 10dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * 11dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * Unless required by applicable law or agreed to in writing, software 12dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * distributed under the License is distributed on an "AS IS" BASIS, 13dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * See the License for the specific language governing permissions and 15dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * limitations under the License. 16dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond */ 17dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 18dee0849a9704d532af0b550146cbafbaa6ee1d19Raymondpackage org.apache.commons.math.stat.clustering; 19dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 20dee0849a9704d532af0b550146cbafbaa6ee1d19Raymondimport java.util.ArrayList; 21dee0849a9704d532af0b550146cbafbaa6ee1d19Raymondimport java.util.Collection; 22dee0849a9704d532af0b550146cbafbaa6ee1d19Raymondimport java.util.List; 23dee0849a9704d532af0b550146cbafbaa6ee1d19Raymondimport java.util.Random; 24dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 25dee0849a9704d532af0b550146cbafbaa6ee1d19Raymondimport org.apache.commons.math.exception.ConvergenceException; 26dee0849a9704d532af0b550146cbafbaa6ee1d19Raymondimport org.apache.commons.math.exception.util.LocalizedFormats; 27dee0849a9704d532af0b550146cbafbaa6ee1d19Raymondimport org.apache.commons.math.stat.descriptive.moment.Variance; 28dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 29dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond/** 30dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * Clustering algorithm based on David Arthur and Sergei Vassilvitski k-means++ algorithm. 31dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param <T> type of the points to cluster 32dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @see <a href="http://en.wikipedia.org/wiki/K-means%2B%2B">K-means++ (wikipedia)</a> 33dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @version $Revision: 1054333 $ $Date: 2011-01-02 01:34:58 +0100 (dim. 02 janv. 2011) $ 34dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @since 2.0 35dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond */ 36dee0849a9704d532af0b550146cbafbaa6ee1d19Raymondpublic class KMeansPlusPlusClusterer<T extends Clusterable<T>> { 37dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 38dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond /** Strategies to use for replacing an empty cluster. */ 39dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond public static enum EmptyClusterStrategy { 40dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 41dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond /** Split the cluster with largest distance variance. */ 42dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond LARGEST_VARIANCE, 43dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 44dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond /** Split the cluster with largest number of points. */ 45dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond LARGEST_POINTS_NUMBER, 46dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 47dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond /** Create a cluster around the point farthest from its centroid. */ 48dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond FARTHEST_POINT, 49dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 50dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond /** Generate an error. */ 51dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond ERROR 52dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 53dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 54dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 55dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond /** Random generator for choosing initial centers. */ 56dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond private final Random random; 57dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 58dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond /** Selected strategy for empty clusters. */ 59dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond private final EmptyClusterStrategy emptyStrategy; 60dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 61dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond /** Build a clusterer. 62dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * <p> 63dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * The default strategy for handling empty clusters that may appear during 64dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * algorithm iterations is to split the cluster with largest distance variance. 65dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * </p> 66dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param random random generator to use for choosing initial centers 67dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond */ 68dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond public KMeansPlusPlusClusterer(final Random random) { 69dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond this(random, EmptyClusterStrategy.LARGEST_VARIANCE); 70dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 71dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 72dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond /** Build a clusterer. 73dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param random random generator to use for choosing initial centers 74dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param emptyStrategy strategy to use for handling empty clusters that 75dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * may appear during algorithm iterations 76dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @since 2.2 77dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond */ 78dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond public KMeansPlusPlusClusterer(final Random random, final EmptyClusterStrategy emptyStrategy) { 79dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond this.random = random; 80dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond this.emptyStrategy = emptyStrategy; 81dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 82dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 83dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond /** 84dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * Runs the K-means++ clustering algorithm. 85dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * 86dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param points the points to cluster 87dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param k the number of clusters to split the data into 88dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param maxIterations the maximum number of iterations to run the algorithm 89dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * for. If negative, no maximum will be used 90dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @return a list of clusters containing the points 91dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond */ 92dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond public List<Cluster<T>> cluster(final Collection<T> points, 93dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final int k, final int maxIterations) { 94dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond // create the initial clusters 95dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond List<Cluster<T>> clusters = chooseInitialCenters(points, k, random); 96dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond assignPointsToClusters(clusters, points); 97dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 98dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond // iterate through updating the centers until we're done 99dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final int max = (maxIterations < 0) ? Integer.MAX_VALUE : maxIterations; 100dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond for (int count = 0; count < max; count++) { 101dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond boolean clusteringChanged = false; 102dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond List<Cluster<T>> newClusters = new ArrayList<Cluster<T>>(); 103dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond for (final Cluster<T> cluster : clusters) { 104dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final T newCenter; 105dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond if (cluster.getPoints().isEmpty()) { 106dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond switch (emptyStrategy) { 107dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond case LARGEST_VARIANCE : 108dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond newCenter = getPointFromLargestVarianceCluster(clusters); 109dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond break; 110dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond case LARGEST_POINTS_NUMBER : 111dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond newCenter = getPointFromLargestNumberCluster(clusters); 112dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond break; 113dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond case FARTHEST_POINT : 114dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond newCenter = getFarthestPoint(clusters); 115dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond break; 116dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond default : 117dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS); 118dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 119dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond clusteringChanged = true; 120dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } else { 121dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond newCenter = cluster.getCenter().centroidOf(cluster.getPoints()); 122dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond if (!newCenter.equals(cluster.getCenter())) { 123dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond clusteringChanged = true; 124dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 125dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 126dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond newClusters.add(new Cluster<T>(newCenter)); 127dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 128dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond if (!clusteringChanged) { 129dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond return clusters; 130dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 131dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond assignPointsToClusters(newClusters, points); 132dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond clusters = newClusters; 133dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 134dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond return clusters; 135dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 136dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 137dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond /** 138dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * Adds the given points to the closest {@link Cluster}. 139dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * 140dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param <T> type of the points to cluster 141dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param clusters the {@link Cluster}s to add the points to 142dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param points the points to add to the given {@link Cluster}s 143dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond */ 144dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond private static <T extends Clusterable<T>> void 145dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond assignPointsToClusters(final Collection<Cluster<T>> clusters, final Collection<T> points) { 146dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond for (final T p : points) { 147dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond Cluster<T> cluster = getNearestCluster(clusters, p); 148dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond cluster.addPoint(p); 149dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 150dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 151dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 152dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond /** 153dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * Use K-means++ to choose the initial centers. 154dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * 155dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param <T> type of the points to cluster 156dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param points the points to choose the initial centers from 157dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param k the number of centers to choose 158dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param random random generator to use 159dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @return the initial centers 160dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond */ 161dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond private static <T extends Clusterable<T>> List<Cluster<T>> 162dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond chooseInitialCenters(final Collection<T> points, final int k, final Random random) { 163dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 164dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final List<T> pointSet = new ArrayList<T>(points); 165dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final List<Cluster<T>> resultSet = new ArrayList<Cluster<T>>(); 166dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 167dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond // Choose one center uniformly at random from among the data points. 168dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final T firstPoint = pointSet.remove(random.nextInt(pointSet.size())); 169dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond resultSet.add(new Cluster<T>(firstPoint)); 170dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 171dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final double[] dx2 = new double[pointSet.size()]; 172dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond while (resultSet.size() < k) { 173dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond // For each data point x, compute D(x), the distance between x and 174dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond // the nearest center that has already been chosen. 175dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond int sum = 0; 176dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond for (int i = 0; i < pointSet.size(); i++) { 177dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final T p = pointSet.get(i); 178dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final Cluster<T> nearest = getNearestCluster(resultSet, p); 179dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final double d = p.distanceFrom(nearest.getCenter()); 180dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond sum += d * d; 181dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond dx2[i] = sum; 182dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 183dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 184dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond // Add one new data point as a center. Each point x is chosen with 185dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond // probability proportional to D(x)2 186dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final double r = random.nextDouble() * sum; 187dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond for (int i = 0 ; i < dx2.length; i++) { 188dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond if (dx2[i] >= r) { 189dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final T p = pointSet.remove(i); 190dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond resultSet.add(new Cluster<T>(p)); 191dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond break; 192dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 193dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 194dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 195dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 196dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond return resultSet; 197dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 198dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 199dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 200dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond /** 201dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * Get a random point from the {@link Cluster} with the largest distance variance. 202dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * 203dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param clusters the {@link Cluster}s to search 204dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @return a random point from the selected cluster 205dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond */ 206dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond private T getPointFromLargestVarianceCluster(final Collection<Cluster<T>> clusters) { 207dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 208dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond double maxVariance = Double.NEGATIVE_INFINITY; 209dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond Cluster<T> selected = null; 210dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond for (final Cluster<T> cluster : clusters) { 211dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond if (!cluster.getPoints().isEmpty()) { 212dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 213dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond // compute the distance variance of the current cluster 214dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final T center = cluster.getCenter(); 215dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final Variance stat = new Variance(); 216dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond for (final T point : cluster.getPoints()) { 217dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond stat.increment(point.distanceFrom(center)); 218dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 219dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final double variance = stat.getResult(); 220dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 221dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond // select the cluster with the largest variance 222dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond if (variance > maxVariance) { 223dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond maxVariance = variance; 224dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond selected = cluster; 225dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 226dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 227dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 228dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 229dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 230dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond // did we find at least one non-empty cluster ? 231dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond if (selected == null) { 232dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS); 233dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 234dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 235dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond // extract a random point from the cluster 236dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final List<T> selectedPoints = selected.getPoints(); 237dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond return selectedPoints.remove(random.nextInt(selectedPoints.size())); 238dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 239dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 240dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 241dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond /** 242dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * Get a random point from the {@link Cluster} with the largest number of points 243dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * 244dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param clusters the {@link Cluster}s to search 245dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @return a random point from the selected cluster 246dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond */ 247dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond private T getPointFromLargestNumberCluster(final Collection<Cluster<T>> clusters) { 248dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 249dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond int maxNumber = 0; 250dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond Cluster<T> selected = null; 251dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond for (final Cluster<T> cluster : clusters) { 252dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 253dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond // get the number of points of the current cluster 254dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final int number = cluster.getPoints().size(); 255dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 256dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond // select the cluster with the largest number of points 257dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond if (number > maxNumber) { 258dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond maxNumber = number; 259dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond selected = cluster; 260dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 261dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 262dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 263dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 264dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond // did we find at least one non-empty cluster ? 265dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond if (selected == null) { 266dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS); 267dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 268dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 269dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond // extract a random point from the cluster 270dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final List<T> selectedPoints = selected.getPoints(); 271dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond return selectedPoints.remove(random.nextInt(selectedPoints.size())); 272dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 273dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 274dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 275dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond /** 276dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * Get the point farthest to its cluster center 277dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * 278dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param clusters the {@link Cluster}s to search 279dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @return point farthest to its cluster center 280dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond */ 281dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond private T getFarthestPoint(final Collection<Cluster<T>> clusters) { 282dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 283dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond double maxDistance = Double.NEGATIVE_INFINITY; 284dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond Cluster<T> selectedCluster = null; 285dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond int selectedPoint = -1; 286dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond for (final Cluster<T> cluster : clusters) { 287dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 288dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond // get the farthest point 289dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final T center = cluster.getCenter(); 290dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final List<T> points = cluster.getPoints(); 291dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond for (int i = 0; i < points.size(); ++i) { 292dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final double distance = points.get(i).distanceFrom(center); 293dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond if (distance > maxDistance) { 294dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond maxDistance = distance; 295dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond selectedCluster = cluster; 296dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond selectedPoint = i; 297dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 298dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 299dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 300dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 301dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 302dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond // did we find at least one non-empty cluster ? 303dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond if (selectedCluster == null) { 304dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS); 305dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 306dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 307dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond return selectedCluster.getPoints().remove(selectedPoint); 308dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 309dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 310dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 311dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond /** 312dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * Returns the nearest {@link Cluster} to the given point 313dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * 314dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param <T> type of the points to cluster 315dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param clusters the {@link Cluster}s to search 316dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param point the point to find the nearest {@link Cluster} for 317dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @return the nearest {@link Cluster} to the given point 318dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond */ 319dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond private static <T extends Clusterable<T>> Cluster<T> 320dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond getNearestCluster(final Collection<Cluster<T>> clusters, final T point) { 321dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond double minDistance = Double.MAX_VALUE; 322dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond Cluster<T> minCluster = null; 323dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond for (final Cluster<T> c : clusters) { 324dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond final double distance = point.distanceFrom(c.getCenter()); 325dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond if (distance < minDistance) { 326dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond minDistance = distance; 327dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond minCluster = c; 328dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 329dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 330dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond return minCluster; 331dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond } 332dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond 333dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond} 334