1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef KMEANS_H
18#define KMEANS_H
19
20#include <cstdlib>
21#include <math.h>
22
23// Helper functions
24
25template <typename T, typename N>
26inline void sum(T values[], int len, int dimension, int stride, N dst[]) {
27    int x, y;
28    // zero out dst vector
29    for (x = 0; x < dimension; x++) {
30        dst[x] = 0;
31    }
32    for (x = 0; x < len; x+= stride) {
33        for (y = 0; y < dimension; y++) {
34            dst[y] += values[x + y];
35        }
36    }
37}
38
39template <typename T, typename N>
40inline void set(T val1[], N val2[], int dimension) {
41    int x;
42    for (x = 0; x < dimension; x++) {
43        val1[x] = val2[x];
44    }
45}
46
47template <typename T, typename N>
48inline void add(T val[], N dst[], int dimension) {
49    int x;
50    for (x = 0; x < dimension; x++) {
51        dst[x] += val[x];
52    }
53}
54
55template <typename T, typename N>
56inline void divide(T dst[], N divisor, int dimension) {
57   int x;
58   if (divisor == 0) {
59       return;
60   }
61   for (x = 0; x < dimension; x++) {
62       dst[x] /= divisor;
63   }
64}
65
66/**
67 * Calculates euclidean distance.
68 */
69
70template <typename T, typename N>
71inline N euclideanDist(T val1[], T val2[], int dimension) {
72    int x;
73    N sum = 0;
74    for (x = 0; x < dimension; x++) {
75        N diff = (N) val1[x] - (N) val2[x];
76        sum += diff * diff;
77    }
78    return sqrt(sum);
79}
80
81// K-Means
82
83
84/**
85 * Picks k random starting points from the data set.
86 */
87template <typename T>
88void initialPickHeuristicRandom(int k, T values[], int len, int dimension, int stride, T dst[],
89        unsigned int seed) {
90    int x, z, num_vals, cntr;
91    num_vals = len / stride;
92    cntr = 0;
93    srand(seed);
94    unsigned int r_vals[k];
95    unsigned int r;
96
97    for (x = 0; x < k; x++) {
98
99        // ensure randomly chosen value is unique
100        int r_check = 0;
101        while (r_check == 0) {
102            r = (unsigned int) rand() % num_vals;
103            r_check = 1;
104            for (z = 0; z < x; z++) {
105                if (r == r_vals[z]) {
106                    r_check = 0;
107                }
108            }
109        }
110        r_vals[x] = r;
111        r *= stride;
112
113        // set dst to be randomly chosen value
114        set<T,T>(dst + cntr, values + r, dimension);
115        cntr += stride;
116    }
117}
118
119/**
120 * Finds index of closet centroid to a value
121 */
122template <typename T, typename N>
123inline int findClosest(T values[], T oldCenters[], int dimension, int stride, int pop_size) {
124    int best_ind = 0;
125    N best_len = euclideanDist <T, N>(values, oldCenters, dimension);
126    int y;
127    for (y = stride; y < pop_size; y+=stride) {
128        N l = euclideanDist <T, N>(values, oldCenters + y, dimension);
129        if (l < best_len) {
130            best_len = l;
131            best_ind = y;
132        }
133    }
134    return best_ind;
135}
136
137/**
138 * Calculates new centroids by averaging value clusters for old centroids.
139 */
140template <typename T, typename N>
141int calculateNewCentroids(int k, T values[], int len, int dimension, int stride, T oldCenters[],
142        T dst[]) {
143    int x, pop_size;
144    pop_size = k * stride;
145    int popularities[k];
146    N tmp[pop_size];
147
148    //zero popularities
149    memset(popularities, 0, sizeof(int) * k);
150    // zero dst, and tmp
151    for (x = 0; x < pop_size; x++) {
152        tmp[x] = 0;
153    }
154
155    // put summation for each k in tmp
156    for (x = 0; x < len; x+=stride) {
157        int best = findClosest<T, N>(values + x, oldCenters, dimension, stride, pop_size);
158        add<T, N>(values + x, tmp + best, dimension);
159        popularities[best / stride]++;
160
161    }
162
163    int ret = 0;
164    int y;
165    // divide to get centroid and set dst to result
166    for (x = 0; x < pop_size; x+=stride) {
167        divide<N, int>(tmp + x, popularities[x / stride], dimension);
168        for (y = 0; y < dimension; y++) {
169            if ((dst + x)[y] != (T) ((tmp + x)[y])) {
170                ret = 1;
171            }
172        }
173        set(dst + x, tmp + x, dimension);
174    }
175    return ret;
176}
177
178template <typename T, typename N>
179void runKMeansWithPicks(int k, T finalCentroids[], T values[], int len, int dimension, int stride,
180        int iterations, T initialPicks[]){
181        int k_len = k * stride;
182        int x;
183
184        // zero newCenters
185        for (x = 0; x < k_len; x++) {
186            finalCentroids[x] = 0;
187        }
188
189        T * c1 = initialPicks;
190        T * c2 = finalCentroids;
191        T * temp;
192        int ret = 1;
193        for (x = 0; x < iterations; x++) {
194            ret = calculateNewCentroids<T, N>(k, values, len, dimension, stride, c1, c2);
195            temp = c1;
196            c1 = c2;
197            c2 = temp;
198            if (ret == 0) {
199                x = iterations;
200            }
201        }
202        set<T, T>(finalCentroids, c1, dimension);
203}
204
205/**
206 * Runs the k-means algorithm on dataset values with some initial centroids.
207 */
208template <typename T, typename N>
209void runKMeans(int k, T finalCentroids[], T values[], int len, int dimension, int stride,
210        int iterations, unsigned int seed){
211    int k_len = k * stride;
212    T initialPicks [k_len];
213    initialPickHeuristicRandom<T>(k, values, len, dimension, stride, initialPicks, seed);
214
215    runKMeansWithPicks<T, N>(k, finalCentroids, values, len, dimension, stride,
216        iterations, initialPicks);
217}
218
219/**
220 * Sets each value in values to the closest centroid.
221 */
222template <typename T, typename N>
223void applyCentroids(int k, T centroids[], T values[], int len, int dimension, int stride) {
224    int x, pop_size;
225    pop_size = k * stride;
226    for (x = 0; x < len; x+= stride) {
227        int best = findClosest<T, N>(values + x, centroids, dimension, stride, pop_size);
228        set<T, T>(values + x, centroids + best, dimension);
229    }
230}
231
232#endif // KMEANS_H
233