]> git.uio.no Git - u/mrichter/AliRoot.git/blame_incremental - JETAN/AliKMeansClustering.cxx
adding cross check histograms
[u/mrichter/AliRoot.git] / JETAN / AliKMeansClustering.cxx
... / ...
CommitLineData
1/**************************************************************************
2 * Copyright(c) 1998-1999, ALICE Experiment at CERN, All rights reserved. *
3 * *
4 * Author: The ALICE Off-line Project. *
5 * Contributors are mentioned in the code where appropriate. *
6 * *
7 * Permission to use, copy, modify and distribute this software and its *
8 * documentation strictly for non-commercial purposes is hereby granted *
9 * without fee, provided that the above copyright notice appears in all *
10 * copies and that both the copyright notice and this permission notice *
11 * appear in the supporting documentation. The authors make no claims *
12 * about the suitability of this software for any purpose. It is *
13 * provided "as is" without express or implied warranty. *
14 **************************************************************************/
15
16// Implemenatation of the K-Means Clustering Algorithm
17// See http://en.wikipedia.org/wiki/K-means_clustering and references therein.
18//
19// This particular implementation is the so called Soft K-means algorithm.
20// It has been modified to work on the cylindrical topology in eta-phi space.
21//
22// Author: Andreas Morsch (CERN)
23// andreas.morsch@cern.ch
24
25#include "AliKMeansClustering.h"
26#include <TMath.h>
27#include <TRandom.h>
28#include <TH1F.h>
29
30ClassImp(AliKMeansClustering)
31
32Double_t AliKMeansClustering::fBeta = 10.;
33
34
35Int_t AliKMeansClustering::SoftKMeans(Int_t k, Int_t n, const Double_t* x, const Double_t* y, Double_t* mx, Double_t* my , Double_t* rk )
36{
37 //
38 // The soft K-means algorithm
39 //
40 Int_t i,j;
41 //
42 // (1) Initialisation of the k means
43
44 for (i = 0; i < k; i++) {
45 mx[i] = 2. * TMath::Pi() * gRandom->Rndm();
46 my[i] = -1. + 2. * gRandom->Rndm();
47 }
48
49 //
50 // (2a) The responsibilities
51 Double_t** r = new Double_t*[n]; // responsibilities
52 for (j = 0; j < n; j++) {r[j] = new Double_t[k];}
53 //
54 // (2b) Normalisation
55 Double_t* nr = new Double_t[n];
56 // (3) Iterations
57 Int_t nit = 0;
58
59 while(1) {
60 nit++;
61 //
62 // Assignment step
63 //
64 for (j = 0; j < n; j++) {
65 nr[j] = 0.;
66 for (i = 0; i < k; i++) {
67 r[j][i] = TMath::Exp(- fBeta * d(mx[i], my[i], x[j], y[j]));
68 nr[j] += r[j][i];
69 } // mean i
70 } // data point j
71
72 for (j = 0; j < n; j++) {
73 for (i = 0; i < k; i++) {
74 r[j][i] /= nr[j];
75 } // mean i
76 } // data point j
77
78 //
79 // Update step
80 Double_t di = 0;
81
82 for (i = 0; i < k; i++) {
83 Double_t oldx = mx[i];
84 Double_t oldy = my[i];
85
86 mx[i] = x[0];
87 my[i] = y[0];
88 rk[i] = r[0][i];
89
90 for (j = 1; j < n; j++) {
91 Double_t xx = x[j];
92//
93// Here we have to take into acount the cylinder topology where phi is defined mod 2xpi
94// If two coordinates are separated by more than pi in phi one has to be shifted by +/- 2 pi
95
96 Double_t dx = mx[i] - x[j];
97 if (dx > TMath::Pi()) xx += 2. * TMath::Pi();
98 if (dx < -TMath::Pi()) xx -= 2. * TMath::Pi();
99 mx[i] = mx[i] * rk[i] + r[j][i] * xx;
100 my[i] = my[i] * rk[i] + r[j][i] * y[j];
101 rk[i] += r[j][i];
102 mx[i] /= rk[i];
103 my[i] /= rk[i];
104 if (mx[i] > 2. * TMath::Pi()) mx[i] -= 2. * TMath::Pi();
105 if (mx[i] < 0. ) mx[i] += 2. * TMath::Pi();
106 } // Data
107 di += d(mx[i], my[i], oldx, oldy);
108 } // means
109 //
110 // ending condition
111 if (di < 1.e-8 || nit > 1000) break;
112 } // while
113
114// Clean-up
115 delete[] nr;
116 for (j = 0; j < n; j++) delete[] r[j];
117 delete[] r;
118//
119 return (nit < 1000);
120
121}
122
123Int_t AliKMeansClustering::SoftKMeans2(Int_t k, Int_t n, Double_t* x, Double_t* y, Double_t* mx, Double_t* my , Double_t* sigma2, Double_t* rk )
124{
125 //
126 // The soft K-means algorithm
127 //
128 Int_t i,j;
129 //
130 // (1) Initialisation of the k means using k-means++ recipe
131 //
132 OptimalInit(k, n, x, y, mx, my);
133 //
134 // (2a) The responsibilities
135 Double_t** r = new Double_t*[n]; // responsibilities
136 for (j = 0; j < n; j++) {r[j] = new Double_t[k];}
137 //
138 // (2b) Normalisation
139 Double_t* nr = new Double_t[n];
140 //
141 // (2c) Weights
142 Double_t* pi = new Double_t[k];
143 //
144 //
145 // (2d) Initialise the responsibilties and weights
146 for (j = 0; j < n; j++) {
147 nr[j] = 0.;
148 for (i = 0; i < k; i++) {
149 r[j][i] = TMath::Exp(- fBeta * d(mx[i], my[i], x[j], y[j]));
150 nr[j] += r[j][i];
151 } // mean i
152 } // data point j
153
154 for (i = 0; i < k; i++) {
155 rk[i] = 0.;
156 sigma2[i] = 1./fBeta;
157
158 for (j = 0; j < n; j++) {
159 r[j][i] /= nr[j];
160 rk[i] += r[j][i];
161 } // mean i
162 pi[i] = rk[i] / Double_t(n);
163 } // data point j
164 // (3) Iterations
165 Int_t nit = 0;
166
167 while(1) {
168 nit++;
169 //
170 // Assignment step
171 //
172 for (j = 0; j < n; j++) {
173 nr[j] = 0.;
174 for (i = 0; i < k; i++) {
175 r[j][i] = pi[i] * TMath::Exp(- d(mx[i], my[i], x[j], y[j]) / sigma2[i] )
176 / (2. * sigma2[i] * TMath::Pi() * TMath::Pi());
177 nr[j] += r[j][i];
178 } // mean i
179 } // data point j
180
181 for (i = 0; i < k; i++) {
182 for (j = 0; j < n; j++) {
183 r[j][i] /= nr[j];
184 } // mean i
185 } // data point j
186
187 //
188 // Update step
189 Double_t di = 0;
190
191 for (i = 0; i < k; i++) {
192 Double_t oldx = mx[i];
193 Double_t oldy = my[i];
194
195 mx[i] = x[0];
196 my[i] = y[0];
197 rk[i] = r[0][i];
198 for (j = 1; j < n; j++) {
199 Double_t xx = x[j];
200//
201// Here we have to take into acount the cylinder topology where phi is defined mod 2xpi
202// If two coordinates are separated by more than pi in phi one has to be shifted by +/- 2 pi
203
204 Double_t dx = mx[i] - x[j];
205 if (dx > TMath::Pi()) xx += 2. * TMath::Pi();
206 if (dx < -TMath::Pi()) xx -= 2. * TMath::Pi();
207 if (r[j][i] > 1.e-15) {
208 mx[i] = mx[i] * rk[i] + r[j][i] * xx;
209 my[i] = my[i] * rk[i] + r[j][i] * y[j];
210 rk[i] += r[j][i];
211 mx[i] /= rk[i];
212 my[i] /= rk[i];
213 }
214 if (mx[i] > 2. * TMath::Pi()) mx[i] -= 2. * TMath::Pi();
215 if (mx[i] < 0. ) mx[i] += 2. * TMath::Pi();
216 } // Data
217 di += d(mx[i], my[i], oldx, oldy);
218
219 } // means
220 //
221 // Sigma
222 for (i = 0; i < k; i++) {
223 sigma2[i] = 0.;
224 for (j = 0; j < n; j++) {
225 sigma2[i] += r[j][i] * d(mx[i], my[i], x[j], y[j]);
226 } // Data
227 sigma2[i] /= rk[i];
228 if (sigma2[i] < 0.0025) sigma2[i] = 0.0025;
229 } // Clusters
230 //
231 // Fractions
232 for (i = 0; i < k; i++) pi[i] = rk[i] / Double_t(n);
233 //
234// ending condition
235 if (di < 1.e-8 || nit > 1000) break;
236 } // while
237
238// Clean-up
239 delete[] nr;
240 delete[] pi;
241 for (j = 0; j < n; j++) delete[] r[j];
242 delete[] r;
243//
244 return (nit < 1000);
245}
246
247Int_t AliKMeansClustering::SoftKMeans3(Int_t k, Int_t n, Double_t* x, Double_t* y, Double_t* mx, Double_t* my ,
248 Double_t* sigmax2, Double_t* sigmay2, Double_t* rk )
249{
250 //
251 // The soft K-means algorithm
252 //
253 Int_t i,j;
254 //
255 // (1) Initialisation of the k means using k-means++ recipe
256 //
257 OptimalInit(k, n, x, y, mx, my);
258 //
259 // (2a) The responsibilities
260 Double_t** r = new Double_t*[n]; // responsibilities
261 for (j = 0; j < n; j++) {r[j] = new Double_t[k];}
262 //
263 // (2b) Normalisation
264 Double_t* nr = new Double_t[n];
265 //
266 // (2c) Weights
267 Double_t* pi = new Double_t[k];
268 //
269 //
270 // (2d) Initialise the responsibilties and weights
271 for (j = 0; j < n; j++) {
272 nr[j] = 0.;
273 for (i = 0; i < k; i++) {
274
275 r[j][i] = TMath::Exp(- fBeta * d(mx[i], my[i], x[j], y[j]));
276 nr[j] += r[j][i];
277 } // mean i
278 } // data point j
279
280 for (i = 0; i < k; i++) {
281 rk[i] = 0.;
282 sigmax2[i] = 1./fBeta;
283 sigmay2[i] = 1./fBeta;
284
285 for (j = 0; j < n; j++) {
286 r[j][i] /= nr[j];
287 rk[i] += r[j][i];
288 } // mean i
289 pi[i] = rk[i] / Double_t(n);
290 } // data point j
291 // (3) Iterations
292 Int_t nit = 0;
293
294 while(1) {
295 nit++;
296 //
297 // Assignment step
298 //
299 for (j = 0; j < n; j++) {
300 nr[j] = 0.;
301 for (i = 0; i < k; i++) {
302
303 Double_t dx = TMath::Abs(mx[i]-x[j]);
304 if (dx > TMath::Pi()) dx = 2. * TMath::Pi() - dx;
305 Double_t dy = TMath::Abs(my[i]-y[j]);
306 r[j][i] = pi[i] * TMath::Exp(-0.5 * (dx * dx / sigmax2[i] + dy * dy / sigmay2[i]))
307 / (2. * TMath::Sqrt(sigmax2[i] * sigmay2[i]) * TMath::Pi() * TMath::Pi());
308 nr[j] += r[j][i];
309 } // mean i
310 } // data point j
311
312 for (i = 0; i < k; i++) {
313 for (j = 0; j < n; j++) {
314 r[j][i] /= nr[j];
315 } // mean i
316 } // data point j
317
318 //
319 // Update step
320 Double_t di = 0;
321
322 for (i = 0; i < k; i++) {
323 Double_t oldx = mx[i];
324 Double_t oldy = my[i];
325
326 mx[i] = x[0];
327 my[i] = y[0];
328 rk[i] = r[0][i];
329 for (j = 1; j < n; j++) {
330 Double_t xx = x[j];
331//
332// Here we have to take into acount the cylinder topology where phi is defined mod 2xpi
333// If two coordinates are separated by more than pi in phi one has to be shifted by +/- 2 pi
334
335 Double_t dx = mx[i] - x[j];
336 if (dx > TMath::Pi()) xx += 2. * TMath::Pi();
337 if (dx < -TMath::Pi()) xx -= 2. * TMath::Pi();
338 if (r[j][i] > 1.e-15) {
339 mx[i] = mx[i] * rk[i] + r[j][i] * xx;
340 my[i] = my[i] * rk[i] + r[j][i] * y[j];
341 rk[i] += r[j][i];
342 mx[i] /= rk[i];
343 my[i] /= rk[i];
344 }
345 if (mx[i] > 2. * TMath::Pi()) mx[i] -= 2. * TMath::Pi();
346 if (mx[i] < 0. ) mx[i] += 2. * TMath::Pi();
347 } // Data
348 di += d(mx[i], my[i], oldx, oldy);
349
350 } // means
351 //
352 // Sigma
353 for (i = 0; i < k; i++) {
354 sigmax2[i] = 0.;
355 sigmay2[i] = 0.;
356
357 for (j = 0; j < n; j++) {
358 Double_t dx = TMath::Abs(mx[i]-x[j]);
359 if (dx > TMath::Pi()) dx = 2. * TMath::Pi() - dx;
360 Double_t dy = TMath::Abs(my[i]-y[j]);
361 sigmax2[i] += r[j][i] * dx * dx;
362 sigmay2[i] += r[j][i] * dy * dy;
363 } // Data
364 sigmax2[i] /= rk[i];
365 sigmay2[i] /= rk[i];
366 if (sigmax2[i] < 0.0025) sigmax2[i] = 0.0025;
367 if (sigmay2[i] < 0.0025) sigmay2[i] = 0.0025;
368 } // Clusters
369 //
370 // Fractions
371 for (i = 0; i < k; i++) pi[i] = rk[i] / Double_t(n);
372 //
373// ending condition
374 if (di < 1.e-8 || nit > 1000) break;
375 } // while
376
377// Clean-up
378 delete[] nr;
379 delete[] pi;
380 for (j = 0; j < n; j++) delete[] r[j];
381 delete[] r;
382//
383 return (nit < 1000);
384}
385
386Double_t AliKMeansClustering::d(Double_t mx, Double_t my, Double_t x, Double_t y)
387{
388 //
389 // Distance definition
390 // Quasi - Euclidian on the eta-phi cylinder
391
392 Double_t dx = TMath::Abs(mx-x);
393 if (dx > TMath::Pi()) dx = 2. * TMath::Pi() - dx;
394
395 return (0.5*(dx * dx + (my - y) * (my - y)));
396}
397
398
399
400void AliKMeansClustering::OptimalInit(Int_t k, Int_t n, const Double_t* x, const Double_t* y, Double_t* mx, Double_t* my)
401{
402 //
403 // Optimal initialisation using the k-means++ algorithm
404 // http://en.wikipedia.org/wiki/K-means%2B%2B
405 //
406 // k-means++ is an algorithm for choosing the initial values for k-means clustering in statistics and machine learning.
407 // It was proposed in 2007 by David Arthur and Sergei Vassilvitskii as an approximation algorithm for the NP-hard k-means problem---
408 // a way of avoiding the sometimes poor clusterings found by the standard k-means algorithm.
409 //
410 //
411 TH1F d2("d2", "", n, -0.5, Float_t(n)-0.5);
412 d2.Reset();
413
414 // (1) Chose first center as a random point among the input data.
415 Int_t ir = Int_t(Float_t(n) * gRandom->Rndm());
416 mx[0] = x[ir];
417 my[0] = y[ir];
418
419 // (2) Iterate
420 Int_t icl = 1;
421 while(icl < k)
422 {
423 // find min distance to existing clusters
424 for (Int_t j = 0; j < n; j++) {
425 Double_t dmin = 1.e10;
426 for (Int_t i = 0; i < icl; i++) {
427 Double_t dij = d(mx[i], my[i], x[j], y[j]);
428 if (dij < dmin) dmin = dij;
429 } // clusters
430 d2.Fill(Float_t(j), dmin);
431 } // data points
432 // select a new cluster from data points with probability ~d2
433 ir = Int_t(d2.GetRandom() + 0.5);
434 mx[icl] = x[ir];
435 my[icl] = y[ir];
436 icl++;
437 } // icl
438}
439
440
441ClassImp(AliKMeansResult)
442
443
444
445AliKMeansResult::AliKMeansResult(Int_t k):
446 TObject(),
447 fK(k),
448 fMx (new Double_t[k]),
449 fMy (new Double_t[k]),
450 fSigma2(new Double_t[k]),
451 fRk (new Double_t[k]),
452 fTarget(new Double_t[k]),
453 fInd (new Int_t[k])
454{
455// Constructor
456}
457
458AliKMeansResult::AliKMeansResult(const AliKMeansResult &res):
459 TObject(res),
460 fK(res.GetK()),
461 fMx(new Double_t[res.GetK()]),
462 fMy(new Double_t[res.GetK()]),
463 fSigma2(new Double_t[res.GetK()]),
464 fRk(new Double_t[res.GetK()]),
465 fTarget(new Double_t[res.GetK()]),
466 fInd(new Int_t[res.GetK()])
467{
468 // Copy constructor
469 for (Int_t i = 0; i <fK; i++) {
470 fMx[i] = (res.GetMx()) [i];
471 fMy[i] = (res.GetMy()) [i];
472 fSigma2[i] = (res.GetSigma2())[i];
473 fRk[i] = (res.GetRk()) [i];
474 fTarget[i] = (res.GetTarget())[i];
475 fInd[i] = (res.GetInd()) [i];
476 }
477}
478
479AliKMeansResult& AliKMeansResult::operator=(const AliKMeansResult& res)
480{
481 //
482 // Assignment operator
483 if (this != &res) {
484 TObject::operator=(res);
485 if (fK != res.fK) {
486 delete [] fMx;
487 delete [] fMy;
488 delete [] fSigma2;
489 delete [] fRk;
490 delete [] fTarget;
491 delete [] fInd;
492 fK = res.fK;
493 fMx = new Double_t[fK];
494 fMy = new Double_t[fK];
495 fSigma2 = new Double_t[fK];
496 fRk = new Double_t[fK];
497 fTarget = new Double_t[fK];
498 fInd = new Int_t[fK];
499 }
500
501 fK = res.fK;
502 memcpy(fMx, res.fMx, fK*sizeof(Double_t));
503 memcpy(fMy, res.fMy, fK*sizeof(Double_t));
504 memcpy(fSigma2, res.fSigma2, fK*sizeof(Double_t));
505 memcpy(fRk, res.fRk, fK*sizeof(Double_t));
506 memcpy(fTarget, res.fTarget, fK*sizeof(Double_t));
507 memcpy(fInd, res.fInd, fK*sizeof(Int_t));
508 }
509 return *this;
510}
511
512
513AliKMeansResult::~AliKMeansResult()
514{
515// Destructor
516 delete[] fMx;
517 delete[] fMy;
518 delete[] fSigma2;
519 delete[] fRk;
520 delete[] fInd;
521 delete[] fTarget;
522}
523
524void AliKMeansResult::Sort()
525{
526 // Build target array and sort
527 // Sort clusters
528 for (Int_t i = 0; i < fK; i++) {
529 if (fRk[i] > 2.9) {
530 fTarget[i] = fRk[i] / fSigma2[i];
531 }
532 else fTarget[i] = 0.;
533 }
534
535 TMath::Sort(fK, fTarget, fInd);
536}
537
538void AliKMeansResult::Sort(Int_t n, const Double_t* x, const Double_t* y)
539{
540 // Build target array and sort
541 for (Int_t i = 0; i < fK; i++)
542 {
543 Int_t nc = 0;
544 for (Int_t j = 0; j < n; j++)
545 {
546 if (2. * AliKMeansClustering::d(fMx[i], fMy[i], x[j], y[j]) < 2.28 * fSigma2[i]) nc++;
547 }
548
549 if (nc > 2) {
550 fTarget[i] = Double_t(nc) / (2.28 * fSigma2[i]);
551 } else {
552 fTarget[i] = 0.;
553 }
554 }
555
556 TMath::Sort(fK, fTarget, fInd);
557}
558
559void AliKMeansResult::CopyResults(const AliKMeansResult* res)
560{
561 fK = res->GetK();
562 for (Int_t i = 0; i <fK; i++) {
563 fMx[i] = (res->GetMx()) [i];
564 fMy[i] = (res->GetMy()) [i];
565 fSigma2[i] = (res->GetSigma2())[i];
566 fRk[i] = (res->GetRk()) [i];
567 fTarget[i] = (res->GetTarget())[i];
568 fInd[i] = (res->GetInd()) [i];
569 }
570}