1 /**************************************************************************
2 * Copyright(c) 1998-1999, ALICE Experiment at CERN, All rights reserved. *
4 * Author: The ALICE Off-line Project. *
5 * Contributors are mentioned in the code where appropriate. *
7 * Permission to use, copy, modify and distribute this software and its *
8 * documentation strictly for non-commercial purposes is hereby granted *
9 * without fee, provided that the above copyright notice appears in all *
10 * copies and that both the copyright notice and this permission notice *
11 * appear in the supporting documentation. The authors make no claims *
12 * about the suitability of this software for any purpose. It is *
13 * provided "as is" without express or implied warranty. *
14 **************************************************************************/
16 // Implemenatation of the K-Means Clustering Algorithm
17 // See http://en.wikipedia.org/wiki/K-means_clustering and references therein.
19 // This particular implementation is the so called Soft K-means algorithm.
20 // It has been modified to work on the cylindrical topology in eta-phi space.
22 // Author: Andreas Morsch (CERN)
23 // andreas.morsch@cern.ch
25 #include "AliKMeansClustering.h"
30 ClassImp(AliKMeansClustering)
32 Double_t AliKMeansClustering::fBeta = 10.;
35 Int_t AliKMeansClustering::SoftKMeans(Int_t k, Int_t n, Double_t* x, Double_t* y, Double_t* mx, Double_t* my , Double_t* rk )
38 // The soft K-means algorithm
42 // (1) Initialisation of the k means
44 for (i = 0; i < k; i++) {
45 mx[i] = 2. * TMath::Pi() * gRandom->Rndm();
46 my[i] = -1. + 2. * gRandom->Rndm();
50 // (2a) The responsibilities
51 Double_t** r = new Double_t*[n]; // responsibilities
52 for (j = 0; j < n; j++) {r[j] = new Double_t[k];}
55 Double_t* nr = new Double_t[n];
64 for (j = 0; j < n; j++) {
66 for (i = 0; i < k; i++) {
67 r[j][i] = TMath::Exp(- fBeta * d(mx[i], my[i], x[j], y[j]));
72 for (j = 0; j < n; j++) {
73 for (i = 0; i < k; i++) {
82 for (i = 0; i < k; i++) {
83 Double_t oldx = mx[i];
84 Double_t oldy = my[i];
90 for (j = 1; j < n; j++) {
93 // Here we have to take into acount the cylinder topology where phi is defined mod 2xpi
94 // If two coordinates are separated by more than pi in phi one has to be shifted by +/- 2 pi
96 Double_t dx = mx[i] - x[j];
97 if (dx > TMath::Pi()) xx += 2. * TMath::Pi();
98 if (dx < -TMath::Pi()) xx -= 2. * TMath::Pi();
99 mx[i] = mx[i] * rk[i] + r[j][i] * xx;
100 my[i] = my[i] * rk[i] + r[j][i] * y[j];
104 if (mx[i] > 2. * TMath::Pi()) mx[i] -= 2. * TMath::Pi();
105 if (mx[i] < 0. ) mx[i] += 2. * TMath::Pi();
107 di += d(mx[i], my[i], oldx, oldy);
111 if (di < 1.e-8 || nit > 1000) break;
116 for (j = 0; j < n; j++) delete[] r[j];
123 Int_t AliKMeansClustering::SoftKMeans2(Int_t k, Int_t n, Double_t* x, Double_t* y, Double_t* mx, Double_t* my , Double_t* sigma2, Double_t* rk )
126 // The soft K-means algorithm
130 // (1) Initialisation of the k means using k-means++ recipe
132 OptimalInit(k, n, x, y, mx, my);
134 // (2a) The responsibilities
135 Double_t** r = new Double_t*[n]; // responsibilities
136 for (j = 0; j < n; j++) {r[j] = new Double_t[k];}
138 // (2b) Normalisation
139 Double_t* nr = new Double_t[n];
142 Double_t* pi = new Double_t[k];
145 // (2d) Initialise the responsibilties and weights
146 for (j = 0; j < n; j++) {
148 for (i = 0; i < k; i++) {
149 r[j][i] = TMath::Exp(- fBeta * d(mx[i], my[i], x[j], y[j]));
154 for (i = 0; i < k; i++) {
156 sigma2[i] = 1./fBeta;
158 for (j = 0; j < n; j++) {
162 pi[i] = rk[i] / Double_t(n);
166 Bool_t rmovalStep = kFALSE;
173 for (j = 0; j < n; j++) {
175 for (i = 0; i < k; i++) {
176 r[j][i] = pi[i] * TMath::Exp(- d(mx[i], my[i], x[j], y[j]) / sigma2[i] )
177 / (2. * sigma2[i] * TMath::Pi() * TMath::Pi());
182 for (i = 0; i < k; i++) {
183 for (j = 0; j < n; j++) {
192 for (i = 0; i < k; i++) {
193 Double_t oldx = mx[i];
194 Double_t oldy = my[i];
199 for (j = 1; j < n; j++) {
202 // Here we have to take into acount the cylinder topology where phi is defined mod 2xpi
203 // If two coordinates are separated by more than pi in phi one has to be shifted by +/- 2 pi
205 Double_t dx = mx[i] - x[j];
206 if (dx > TMath::Pi()) xx += 2. * TMath::Pi();
207 if (dx < -TMath::Pi()) xx -= 2. * TMath::Pi();
208 if (r[j][i] > 1.e-15) {
209 mx[i] = mx[i] * rk[i] + r[j][i] * xx;
210 my[i] = my[i] * rk[i] + r[j][i] * y[j];
215 if (mx[i] > 2. * TMath::Pi()) mx[i] -= 2. * TMath::Pi();
216 if (mx[i] < 0. ) mx[i] += 2. * TMath::Pi();
218 di += d(mx[i], my[i], oldx, oldy);
223 for (i = 0; i < k; i++) {
225 for (j = 1; j < n; j++) {
226 sigma2[i] += r[j][i] * d(mx[i], my[i], x[j], y[j]);
229 if (sigma2[i] < 0.0025) sigma2[i] = 0.0025;
233 for (i = 0; i < k; i++) pi[i] = rk[i] / Double_t(n);
236 if (di < 1.e-8 || nit > 1000) break;
242 for (j = 0; j < n; j++) delete[] r[j];
248 Int_t AliKMeansClustering::SoftKMeans3(Int_t k, Int_t n, Double_t* x, Double_t* y, Double_t* mx, Double_t* my ,
249 Double_t* sigmax2, Double_t* sigmay2, Double_t* rk )
252 // The soft K-means algorithm
256 // (1) Initialisation of the k means using k-means++ recipe
258 OptimalInit(k, n, x, y, mx, my);
260 // (2a) The responsibilities
261 Double_t** r = new Double_t*[n]; // responsibilities
262 for (j = 0; j < n; j++) {r[j] = new Double_t[k];}
264 // (2b) Normalisation
265 Double_t* nr = new Double_t[n];
268 Double_t* pi = new Double_t[k];
271 // (2d) Initialise the responsibilties and weights
272 for (j = 0; j < n; j++) {
274 for (i = 0; i < k; i++) {
276 r[j][i] = TMath::Exp(- fBeta * d(mx[i], my[i], x[j], y[j]));
281 for (i = 0; i < k; i++) {
283 sigmax2[i] = 1./fBeta;
284 sigmay2[i] = 1./fBeta;
286 for (j = 0; j < n; j++) {
290 pi[i] = rk[i] / Double_t(n);
294 Bool_t rmovalStep = kFALSE;
301 for (j = 0; j < n; j++) {
303 for (i = 0; i < k; i++) {
305 Double_t dx = TMath::Abs(mx[i]-x[j]);
306 if (dx > TMath::Pi()) dx = 2. * TMath::Pi() - dx;
307 Double_t dy = TMath::Abs(my[i]-y[j]);
308 r[j][i] = pi[i] * TMath::Exp(-0.5 * (dx * dx / sigmax2[i] + dy * dy / sigmay2[i]))
309 / (2. * TMath::Sqrt(sigmax2[i] * sigmay2[i]) * TMath::Pi() * TMath::Pi());
314 for (i = 0; i < k; i++) {
315 for (j = 0; j < n; j++) {
324 for (i = 0; i < k; i++) {
325 Double_t oldx = mx[i];
326 Double_t oldy = my[i];
331 for (j = 1; j < n; j++) {
334 // Here we have to take into acount the cylinder topology where phi is defined mod 2xpi
335 // If two coordinates are separated by more than pi in phi one has to be shifted by +/- 2 pi
337 Double_t dx = mx[i] - x[j];
338 if (dx > TMath::Pi()) xx += 2. * TMath::Pi();
339 if (dx < -TMath::Pi()) xx -= 2. * TMath::Pi();
340 if (r[j][i] > 1.e-15) {
341 mx[i] = mx[i] * rk[i] + r[j][i] * xx;
342 my[i] = my[i] * rk[i] + r[j][i] * y[j];
347 if (mx[i] > 2. * TMath::Pi()) mx[i] -= 2. * TMath::Pi();
348 if (mx[i] < 0. ) mx[i] += 2. * TMath::Pi();
350 di += d(mx[i], my[i], oldx, oldy);
355 for (i = 0; i < k; i++) {
359 for (j = 1; j < n; j++) {
360 Double_t dx = TMath::Abs(mx[i]-x[j]);
361 if (dx > TMath::Pi()) dx = 2. * TMath::Pi() - dx;
362 Double_t dy = TMath::Abs(my[i]-y[j]);
363 sigmax2[i] += r[j][i] * dx * dx;
364 sigmay2[i] += r[j][i] * dy * dy;
368 if (sigmax2[i] < 0.0025) sigmax2[i] = 0.0025;
369 if (sigmay2[i] < 0.0025) sigmay2[i] = 0.0025;
373 for (i = 0; i < k; i++) pi[i] = rk[i] / Double_t(n);
376 if (di < 1.e-8 || nit > 1000) break;
382 for (j = 0; j < n; j++) delete[] r[j];
388 Double_t AliKMeansClustering::d(Double_t mx, Double_t my, Double_t x, Double_t y)
391 // Distance definition
392 // Quasi - Euclidian on the eta-phi cylinder
394 Double_t dx = TMath::Abs(mx-x);
395 if (dx > TMath::Pi()) dx = 2. * TMath::Pi() - dx;
397 return (0.5*(dx * dx + (my - y) * (my - y)));
402 void AliKMeansClustering::OptimalInit(Int_t k, Int_t n, Double_t* x, Double_t* y, Double_t* mx, Double_t* my)
405 // Optimal initialisation using the k-means++ algorithm
406 // http://en.wikipedia.org/wiki/K-means%2B%2B
408 // k-means++ is an algorithm for choosing the initial values for k-means clustering in statistics and machine learning.
409 // It was proposed in 2007 by David Arthur and Sergei Vassilvitskii as an approximation algorithm for the NP-hard k-means problem---
410 // a way of avoiding the sometimes poor clusterings found by the standard k-means algorithm.
413 TH1F d2("d2", "", n, -0.5, Float_t(n)-0.5);
416 // (1) Chose first center as a random point among the input data.
417 Int_t ir = Int_t(Float_t(n) * gRandom->Rndm());
425 // find min distance to existing clusters
426 for (Int_t j = 0; j < n; j++) {
427 Double_t dmin = 1.e10;
428 for (Int_t i = 0; i < icl; i++) {
429 Double_t dij = d(mx[i], my[i], x[j], y[j]);
430 if (dij < dmin) dmin = dij;
432 d2.Fill(Float_t(j), dmin);
434 // select a new cluster from data points with probability ~d2
435 ir = Int_t(d2.GetRandom() + 0.5);
443 ClassImp(AliKMeansResult)
447 AliKMeansResult::AliKMeansResult(Int_t k):
450 fMx (new Double_t[k]),
451 fMy (new Double_t[k]),
452 fSigma2(new Double_t[k]),
453 fRk (new Double_t[k]),
454 fTarget(new Double_t[k]),
460 AliKMeansResult::AliKMeansResult(const AliKMeansResult &res):
471 for (Int_t i = 0; i <fK; i++) {
472 fMx[i] = (res.GetMx()) [i];
473 fMy[i] = (res.GetMy()) [i];
474 fSigma2[i] = (res.GetSigma2())[i];
475 fRk[i] = (res.GetRk()) [i];
476 fTarget[i] = (res.GetTarget())[i];
477 fInd[i] = (res.GetInd()) [i];
481 AliKMeansResult& AliKMeansResult::operator=(const AliKMeansResult& res)
484 // Assignment operator
487 for (Int_t i = 0; i <fK; i++) {
488 fMx[i] = (res.GetMx()) [i];
489 fMy[i] = (res.GetMy()) [i];
490 fSigma2[i] = (res.GetSigma2())[i];
491 fRk[i] = (res.GetRk()) [i];
492 fTarget[i] = (res.GetTarget())[i];
493 fInd[i] = (res.GetInd()) [i];
500 AliKMeansResult::~AliKMeansResult()
511 void AliKMeansResult::Sort()
514 for (Int_t i = 0; i < fK; i++) {
515 if (fRk[i] > 1.) fRk[i] /= fSigma2[i];
519 TMath::Sort(fK, fRk, fInd);
523 void AliKMeansResult::Sort(Int_t n, Double_t* x, Double_t* y)
525 // Build target array and sort
526 for (Int_t i = 0; i < fK; i++)
529 for (Int_t j = 0; j < n; j++)
531 if (2. * AliKMeansClustering::d(fMx[i], fMy[i], x[j], y[j]) < fSigma2[i]) nc++;
534 fTarget[i] = Double_t(nc) / fSigma2[i];
540 TMath::Sort(fK, fTarget, fInd);
543 void AliKMeansResult::CopyResults(AliKMeansResult* res)
546 for (Int_t i = 0; i <fK; i++) {
547 fMx[i] = (res->GetMx()) [i];
548 fMy[i] = (res->GetMy()) [i];
549 fSigma2[i] = (res->GetSigma2())[i];
550 fRk[i] = (res->GetRk()) [i];
551 fTarget[i] = (res->GetTarget())[i];
552 fInd[i] = (res->GetInd()) [i];