1dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond/*
2dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * Licensed to the Apache Software Foundation (ASF) under one or more
3dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * contributor license agreements.  See the NOTICE file distributed with
4dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * this work for additional information regarding copyright ownership.
5dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * The ASF licenses this file to You under the Apache License, Version 2.0
6dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * (the "License"); you may not use this file except in compliance with
7dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * the License.  You may obtain a copy of the License at
8dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond *
9dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond *      http://www.apache.org/licenses/LICENSE-2.0
10dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond *
11dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * Unless required by applicable law or agreed to in writing, software
12dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * distributed under the License is distributed on an "AS IS" BASIS,
13dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * See the License for the specific language governing permissions and
15dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * limitations under the License.
16dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond */
17dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
18dee0849a9704d532af0b550146cbafbaa6ee1d19Raymondpackage org.apache.commons.math.stat.clustering;
19dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
20dee0849a9704d532af0b550146cbafbaa6ee1d19Raymondimport java.util.ArrayList;
21dee0849a9704d532af0b550146cbafbaa6ee1d19Raymondimport java.util.Collection;
22dee0849a9704d532af0b550146cbafbaa6ee1d19Raymondimport java.util.List;
23dee0849a9704d532af0b550146cbafbaa6ee1d19Raymondimport java.util.Random;
24dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
25dee0849a9704d532af0b550146cbafbaa6ee1d19Raymondimport org.apache.commons.math.exception.ConvergenceException;
26dee0849a9704d532af0b550146cbafbaa6ee1d19Raymondimport org.apache.commons.math.exception.util.LocalizedFormats;
27dee0849a9704d532af0b550146cbafbaa6ee1d19Raymondimport org.apache.commons.math.stat.descriptive.moment.Variance;
28dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
29dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond/**
30dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * Clustering algorithm based on David Arthur and Sergei Vassilvitski k-means++ algorithm.
31dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @param <T> type of the points to cluster
32dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @see <a href="http://en.wikipedia.org/wiki/K-means%2B%2B">K-means++ (wikipedia)</a>
33dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @version $Revision: 1054333 $ $Date: 2011-01-02 01:34:58 +0100 (dim. 02 janv. 2011) $
34dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond * @since 2.0
35dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond */
36dee0849a9704d532af0b550146cbafbaa6ee1d19Raymondpublic class KMeansPlusPlusClusterer<T extends Clusterable<T>> {
37dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
38dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    /** Strategies to use for replacing an empty cluster. */
39dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    public static enum EmptyClusterStrategy {
40dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
41dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        /** Split the cluster with largest distance variance. */
42dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        LARGEST_VARIANCE,
43dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
44dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        /** Split the cluster with largest number of points. */
45dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        LARGEST_POINTS_NUMBER,
46dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
47dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        /** Create a cluster around the point farthest from its centroid. */
48dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        FARTHEST_POINT,
49dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
50dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        /** Generate an error. */
51dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        ERROR
52dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
53dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    }
54dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
55dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    /** Random generator for choosing initial centers. */
56dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    private final Random random;
57dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
58dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    /** Selected strategy for empty clusters. */
59dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    private final EmptyClusterStrategy emptyStrategy;
60dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
61dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    /** Build a clusterer.
62dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * <p>
63dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * The default strategy for handling empty clusters that may appear during
64dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * algorithm iterations is to split the cluster with largest distance variance.
65dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * </p>
66dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param random random generator to use for choosing initial centers
67dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     */
68dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    public KMeansPlusPlusClusterer(final Random random) {
69dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        this(random, EmptyClusterStrategy.LARGEST_VARIANCE);
70dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    }
71dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
72dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    /** Build a clusterer.
73dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param random random generator to use for choosing initial centers
74dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param emptyStrategy strategy to use for handling empty clusters that
75dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * may appear during algorithm iterations
76dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @since 2.2
77dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     */
78dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    public KMeansPlusPlusClusterer(final Random random, final EmptyClusterStrategy emptyStrategy) {
79dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        this.random        = random;
80dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        this.emptyStrategy = emptyStrategy;
81dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    }
82dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
83dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    /**
84dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * Runs the K-means++ clustering algorithm.
85dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     *
86dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param points the points to cluster
87dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param k the number of clusters to split the data into
88dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param maxIterations the maximum number of iterations to run the algorithm
89dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     *     for.  If negative, no maximum will be used
90dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @return a list of clusters containing the points
91dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     */
92dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    public List<Cluster<T>> cluster(final Collection<T> points,
93dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                                    final int k, final int maxIterations) {
94dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        // create the initial clusters
95dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        List<Cluster<T>> clusters = chooseInitialCenters(points, k, random);
96dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        assignPointsToClusters(clusters, points);
97dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
98dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        // iterate through updating the centers until we're done
99dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        final int max = (maxIterations < 0) ? Integer.MAX_VALUE : maxIterations;
100dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        for (int count = 0; count < max; count++) {
101dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            boolean clusteringChanged = false;
102dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            List<Cluster<T>> newClusters = new ArrayList<Cluster<T>>();
103dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            for (final Cluster<T> cluster : clusters) {
104dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                final T newCenter;
105dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                if (cluster.getPoints().isEmpty()) {
106dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                    switch (emptyStrategy) {
107dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                        case LARGEST_VARIANCE :
108dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                            newCenter = getPointFromLargestVarianceCluster(clusters);
109dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                            break;
110dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                        case LARGEST_POINTS_NUMBER :
111dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                            newCenter = getPointFromLargestNumberCluster(clusters);
112dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                            break;
113dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                        case FARTHEST_POINT :
114dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                            newCenter = getFarthestPoint(clusters);
115dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                            break;
116dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                        default :
117dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                            throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS);
118dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                    }
119dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                    clusteringChanged = true;
120dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                } else {
121dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                    newCenter = cluster.getCenter().centroidOf(cluster.getPoints());
122dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                    if (!newCenter.equals(cluster.getCenter())) {
123dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                        clusteringChanged = true;
124dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                    }
125dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                }
126dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                newClusters.add(new Cluster<T>(newCenter));
127dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            }
128dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            if (!clusteringChanged) {
129dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                return clusters;
130dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            }
131dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            assignPointsToClusters(newClusters, points);
132dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            clusters = newClusters;
133dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        }
134dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        return clusters;
135dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    }
136dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
137dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    /**
138dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * Adds the given points to the closest {@link Cluster}.
139dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     *
140dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param <T> type of the points to cluster
141dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param clusters the {@link Cluster}s to add the points to
142dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param points the points to add to the given {@link Cluster}s
143dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     */
144dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    private static <T extends Clusterable<T>> void
145dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        assignPointsToClusters(final Collection<Cluster<T>> clusters, final Collection<T> points) {
146dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        for (final T p : points) {
147dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            Cluster<T> cluster = getNearestCluster(clusters, p);
148dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            cluster.addPoint(p);
149dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        }
150dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    }
151dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
152dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    /**
153dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * Use K-means++ to choose the initial centers.
154dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     *
155dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param <T> type of the points to cluster
156dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param points the points to choose the initial centers from
157dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param k the number of centers to choose
158dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param random random generator to use
159dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @return the initial centers
160dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     */
161dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    private static <T extends Clusterable<T>> List<Cluster<T>>
162dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        chooseInitialCenters(final Collection<T> points, final int k, final Random random) {
163dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
164dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        final List<T> pointSet = new ArrayList<T>(points);
165dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        final List<Cluster<T>> resultSet = new ArrayList<Cluster<T>>();
166dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
167dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        // Choose one center uniformly at random from among the data points.
168dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        final T firstPoint = pointSet.remove(random.nextInt(pointSet.size()));
169dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        resultSet.add(new Cluster<T>(firstPoint));
170dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
171dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        final double[] dx2 = new double[pointSet.size()];
172dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        while (resultSet.size() < k) {
173dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            // For each data point x, compute D(x), the distance between x and
174dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            // the nearest center that has already been chosen.
175dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            int sum = 0;
176dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            for (int i = 0; i < pointSet.size(); i++) {
177dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                final T p = pointSet.get(i);
178dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                final Cluster<T> nearest = getNearestCluster(resultSet, p);
179dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                final double d = p.distanceFrom(nearest.getCenter());
180dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                sum += d * d;
181dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                dx2[i] = sum;
182dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            }
183dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
184dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            // Add one new data point as a center. Each point x is chosen with
185dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            // probability proportional to D(x)2
186dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            final double r = random.nextDouble() * sum;
187dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            for (int i = 0 ; i < dx2.length; i++) {
188dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                if (dx2[i] >= r) {
189dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                    final T p = pointSet.remove(i);
190dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                    resultSet.add(new Cluster<T>(p));
191dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                    break;
192dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                }
193dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            }
194dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        }
195dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
196dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        return resultSet;
197dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
198dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    }
199dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
200dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    /**
201dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * Get a random point from the {@link Cluster} with the largest distance variance.
202dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     *
203dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param clusters the {@link Cluster}s to search
204dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @return a random point from the selected cluster
205dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     */
206dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    private T getPointFromLargestVarianceCluster(final Collection<Cluster<T>> clusters) {
207dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
208dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        double maxVariance = Double.NEGATIVE_INFINITY;
209dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        Cluster<T> selected = null;
210dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        for (final Cluster<T> cluster : clusters) {
211dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            if (!cluster.getPoints().isEmpty()) {
212dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
213dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                // compute the distance variance of the current cluster
214dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                final T center = cluster.getCenter();
215dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                final Variance stat = new Variance();
216dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                for (final T point : cluster.getPoints()) {
217dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                    stat.increment(point.distanceFrom(center));
218dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                }
219dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                final double variance = stat.getResult();
220dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
221dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                // select the cluster with the largest variance
222dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                if (variance > maxVariance) {
223dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                    maxVariance = variance;
224dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                    selected = cluster;
225dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                }
226dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
227dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            }
228dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        }
229dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
230dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        // did we find at least one non-empty cluster ?
231dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        if (selected == null) {
232dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS);
233dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        }
234dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
235dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        // extract a random point from the cluster
236dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        final List<T> selectedPoints = selected.getPoints();
237dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        return selectedPoints.remove(random.nextInt(selectedPoints.size()));
238dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
239dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    }
240dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
241dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    /**
242dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * Get a random point from the {@link Cluster} with the largest number of points
243dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     *
244dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param clusters the {@link Cluster}s to search
245dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @return a random point from the selected cluster
246dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     */
247dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    private T getPointFromLargestNumberCluster(final Collection<Cluster<T>> clusters) {
248dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
249dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        int maxNumber = 0;
250dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        Cluster<T> selected = null;
251dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        for (final Cluster<T> cluster : clusters) {
252dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
253dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            // get the number of points of the current cluster
254dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            final int number = cluster.getPoints().size();
255dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
256dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            // select the cluster with the largest number of points
257dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            if (number > maxNumber) {
258dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                maxNumber = number;
259dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                selected = cluster;
260dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            }
261dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
262dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        }
263dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
264dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        // did we find at least one non-empty cluster ?
265dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        if (selected == null) {
266dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS);
267dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        }
268dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
269dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        // extract a random point from the cluster
270dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        final List<T> selectedPoints = selected.getPoints();
271dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        return selectedPoints.remove(random.nextInt(selectedPoints.size()));
272dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
273dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    }
274dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
275dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    /**
276dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * Get the point farthest to its cluster center
277dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     *
278dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param clusters the {@link Cluster}s to search
279dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @return point farthest to its cluster center
280dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     */
281dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    private T getFarthestPoint(final Collection<Cluster<T>> clusters) {
282dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
283dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        double maxDistance = Double.NEGATIVE_INFINITY;
284dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        Cluster<T> selectedCluster = null;
285dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        int selectedPoint = -1;
286dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        for (final Cluster<T> cluster : clusters) {
287dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
288dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            // get the farthest point
289dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            final T center = cluster.getCenter();
290dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            final List<T> points = cluster.getPoints();
291dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            for (int i = 0; i < points.size(); ++i) {
292dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                final double distance = points.get(i).distanceFrom(center);
293dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                if (distance > maxDistance) {
294dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                    maxDistance     = distance;
295dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                    selectedCluster = cluster;
296dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                    selectedPoint   = i;
297dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                }
298dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            }
299dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
300dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        }
301dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
302dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        // did we find at least one non-empty cluster ?
303dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        if (selectedCluster == null) {
304dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS);
305dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        }
306dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
307dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        return selectedCluster.getPoints().remove(selectedPoint);
308dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
309dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    }
310dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
311dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    /**
312dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * Returns the nearest {@link Cluster} to the given point
313dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     *
314dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param <T> type of the points to cluster
315dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param clusters the {@link Cluster}s to search
316dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @param point the point to find the nearest {@link Cluster} for
317dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     * @return the nearest {@link Cluster} to the given point
318dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond     */
319dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    private static <T extends Clusterable<T>> Cluster<T>
320dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        getNearestCluster(final Collection<Cluster<T>> clusters, final T point) {
321dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        double minDistance = Double.MAX_VALUE;
322dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        Cluster<T> minCluster = null;
323dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        for (final Cluster<T> c : clusters) {
324dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            final double distance = point.distanceFrom(c.getCenter());
325dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            if (distance < minDistance) {
326dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                minDistance = distance;
327dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond                minCluster = c;
328dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond            }
329dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        }
330dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond        return minCluster;
331dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond    }
332dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond
333dee0849a9704d532af0b550146cbafbaa6ee1d19Raymond}
334