JETAN/AliKMeansClustering.cxx

   1 /**************************************************************************
   2  * Copyright(c) 1998-1999, ALICE Experiment at CERN, All rights reserved. *
   3  *                                                                        *
   4  * Author: The ALICE Off-line Project.                                    *
   5  * Contributors are mentioned in the code where appropriate.              *
   6  *                                                                        *
   7  * Permission to use, copy, modify and distribute this software and its   *
   8  * documentation strictly for non-commercial purposes is hereby granted   *
   9  * without fee, provided that the above copyright notice appears in all   *
  10  * copies and that both the copyright notice and this permission notice   *
  11  * appear in the supporting documentation. The authors make no claims     *
  12  * about the suitability of this software for any purpose. It is          *
  13  * provided "as is" without express or implied warranty.                  *
  14  **************************************************************************/
  15
  16 // Implemenatation of the K-Means Clustering Algorithm
  17 // See http://en.wikipedia.org/wiki/K-means_clustering and references therein.
  18 //
  19 // This particular implementation is the so called Soft K-means algorithm.
  20 // It has been modified to work on the cylindrical topology in eta-phi space.
  21 //
  22 // Author: Andreas Morsch (CERN)
  23 // andreas.morsch@cern.ch
  24
  25 #include "AliKMeansClustering.h"
  26 #include <TMath.h>
  27 #include <TRandom.h>
  28 #include <TH1F.h>
  29
  30 ClassImp(AliKMeansClustering)
  31
  32 Double_t AliKMeansClustering::fBeta = 10.;
  33
  34
  35 Int_t AliKMeansClustering::SoftKMeans(Int_t k, Int_t n, Double_t* x, Double_t* y, Double_t* mx, Double_t* my , Double_t* rk )
  36 {
  37     //
  38     // The soft K-means algorithm
  39     //
  40     Int_t i,j;
  41     //
  42     // (1) Initialisation of the k means
  43
  44     for (i = 0; i < k; i++) {
  45         mx[i] = 2. * TMath::Pi() * gRandom->Rndm();
  46         my[i] = -1. + 2. * gRandom->Rndm();
  47     }
  48
  49     //
  50     // (2a) The responsibilities
  51     Double_t** r = new Double_t*[n]; // responsibilities
  52     for (j = 0; j < n; j++) {r[j] = new Double_t[k];}
  53     //
  54     // (2b) Normalisation
  55     Double_t* nr   = new Double_t[n];
  56     // (3) Iterations
  57     Int_t nit = 0;
  58
  59     while(1) {
  60         nit++;
  61       //
  62       // Assignment step
  63       //
  64       for (j = 0; j < n; j++) {
  65         nr[j] = 0.;
  66         for (i = 0; i < k; i++) {
  67           r[j][i] = TMath::Exp(- fBeta * d(mx[i], my[i], x[j], y[j]));
  68           nr[j] += r[j][i];
  69         } // mean i
  70       } // data point j
  71
  72       for (j = 0; j < n; j++) {
  73         for (i = 0; i < k; i++) {
  74           r[j][i] /=  nr[j];
  75         } // mean i
  76       } // data point j
  77
  78         //
  79         // Update step
  80       Double_t di = 0;
  81
  82       for (i = 0; i < k; i++) {
  83           Double_t oldx = mx[i];
  84           Double_t oldy = my[i];
  85
  86           mx[i] = x[0];
  87           my[i] = y[0];
  88           rk[i] = r[0][i];
  89
  90         for (j = 1; j < n; j++) {
  91             Double_t xx =  x[j];
  92 //
  93 // Here we have to take into acount the cylinder topology where phi is defined mod 2xpi
  94 // If two coordinates are separated by more than pi in phi one has to be shifted by +/- 2 pi
  95
  96             Double_t dx = mx[i] - x[j];
  97             if (dx >  TMath::Pi()) xx += 2. * TMath::Pi();
  98             if (dx < -TMath::Pi()) xx -= 2. * TMath::Pi();
  99             mx[i] = mx[i] * rk[i] + r[j][i] * xx;
 100             my[i] = my[i] * rk[i] + r[j][i] * y[j];
 101             rk[i] += r[j][i];
 102             mx[i] /= rk[i];
 103             my[i] /= rk[i];
 104             if (mx[i] > 2. * TMath::Pi()) mx[i] -= 2. * TMath::Pi();
 105             if (mx[i] < 0.              ) mx[i] += 2. * TMath::Pi();
 106         } // Data
 107         di += d(mx[i], my[i], oldx, oldy);
 108       } // means
 109         //
 110         // ending condition
 111       if (di < 1.e-8 || nit > 1000) break;
 112     } // while
 113
 114 // Clean-up
 115     delete[] nr;
 116     for (j = 0; j < n; j++) delete[] r[j];
 117     delete[] r;
 118 //
 119     return (nit < 1000);
 120
 121 }
 122
 123 Int_t AliKMeansClustering::SoftKMeans2(Int_t k, Int_t n, Double_t* x, Double_t* y, Double_t* mx, Double_t* my , Double_t* sigma2, Double_t* rk )
 124 {
 125     //
 126     // The soft K-means algorithm
 127     //
 128     Int_t i,j;
 129     //
 130     // (1) Initialisation of the k means using k-means++ recipe
 131     //
 132      OptimalInit(k, n, x, y, mx, my);
 133     //
 134     // (2a) The responsibilities
 135     Double_t** r = new Double_t*[n]; // responsibilities
 136     for (j = 0; j < n; j++) {r[j] = new Double_t[k];}
 137     //
 138     // (2b) Normalisation
 139     Double_t* nr = new Double_t[n];
 140     //
 141     // (2c) Weights
 142     Double_t* pi = new Double_t[k];
 143     //
 144     //
 145     // (2d) Initialise the responsibilties and weights
 146     for (j = 0; j < n; j++) {
 147       nr[j] = 0.;
 148       for (i = 0; i < k; i++) {
 149         r[j][i] = TMath::Exp(- fBeta * d(mx[i], my[i], x[j], y[j]));
 150         nr[j] += r[j][i];
 151       } // mean i
 152     } // data point j
 153
 154     for (i = 0; i < k; i++) {
 155       rk[i]    = 0.;
 156       sigma2[i] = 1./fBeta;
 157
 158       for (j = 0; j < n; j++) {
 159         r[j][i] /=  nr[j];
 160         rk[i] += r[j][i];
 161       } // mean i
 162       pi[i] = rk[i] / Double_t(n);
 163     } // data point j
 164     // (3) Iterations
 165     Int_t nit = 0;
 166     Bool_t rmovalStep = kFALSE;
 167
 168     while(1) {
 169         nit++;
 170       //
 171       // Assignment step
 172       //
 173       for (j = 0; j < n; j++) {
 174         nr[j] = 0.;
 175         for (i = 0; i < k; i++) {
 176           r[j][i] = pi[i] * TMath::Exp(- d(mx[i], my[i], x[j], y[j]) / sigma2[i] )
 177             / (2. * sigma2[i] * TMath::Pi() * TMath::Pi());
 178           nr[j] += r[j][i];
 179         } // mean i
 180       } // data point j
 181
 182       for (i = 0; i < k; i++) {
 183         for (j = 0; j < n; j++) {
 184           r[j][i] /=  nr[j];
 185         } // mean i
 186       } // data point j
 187
 188         //
 189         // Update step
 190       Double_t di = 0;
 191
 192       for (i = 0; i < k; i++) {
 193           Double_t oldx = mx[i];
 194           Double_t oldy = my[i];
 195
 196           mx[i] = x[0];
 197           my[i] = y[0];
 198           rk[i] = r[0][i];
 199         for (j = 1; j < n; j++) {
 200             Double_t xx =  x[j];
 201 //
 202 // Here we have to take into acount the cylinder topology where phi is defined mod 2xpi
 203 // If two coordinates are separated by more than pi in phi one has to be shifted by +/- 2 pi
 204
 205             Double_t dx = mx[i] - x[j];
 206             if (dx >  TMath::Pi()) xx += 2. * TMath::Pi();
 207             if (dx < -TMath::Pi()) xx -= 2. * TMath::Pi();
 208             if (r[j][i] > 1.e-15) {
 209               mx[i] = mx[i] * rk[i] + r[j][i] * xx;
 210               my[i] = my[i] * rk[i] + r[j][i] * y[j];
 211               rk[i] += r[j][i];
 212               mx[i] /= rk[i];
 213               my[i] /= rk[i];
 214             }
 215             if (mx[i] > 2. * TMath::Pi()) mx[i] -= 2. * TMath::Pi();
 216             if (mx[i] < 0.              ) mx[i] += 2. * TMath::Pi();
 217         } // Data
 218         di += d(mx[i], my[i], oldx, oldy);
 219
 220       } // means
 221       //
 222       // Sigma
 223       for (i = 0; i < k; i++) {
 224         sigma2[i] = 0.;
 225         for (j = 1; j < n; j++) {
 226           sigma2[i] += r[j][i] * d(mx[i], my[i], x[j], y[j]);
 227         } // Data
 228         sigma2[i] /= rk[i];
 229         if (sigma2[i] < 0.0025) sigma2[i] = 0.0025;
 230       } // Clusters
 231       //
 232       // Fractions
 233       for (i = 0; i < k; i++) pi[i] = rk[i] / Double_t(n);
 234       //
 235 // ending condition
 236       if (di < 1.e-8 || nit > 1000) break;
 237     } // while
 238
 239 // Clean-up
 240     delete[] nr;
 241     delete[] pi;
 242     for (j = 0; j < n; j++) delete[] r[j];
 243     delete[] r;
 244 //
 245     return (nit < 1000);
 246 }
 247
 248 Double_t AliKMeansClustering::d(Double_t mx, Double_t my, Double_t x, Double_t y)
 249 {
 250     //
 251     // Distance definition
 252     // Quasi - Euclidian on the eta-phi cylinder
 253
 254     Double_t dx = TMath::Abs(mx-x);
 255     if (dx > TMath::Pi()) dx = 2. * TMath::Pi() - dx;
 256
 257     return (0.5*(dx * dx + (my - y) * (my - y)));
 258 }
 259
 260
 261
 262 void AliKMeansClustering::OptimalInit(Int_t k, Int_t n, Double_t* x, Double_t* y, Double_t* mx, Double_t* my)
 263 {
 264   //
 265   // Optimal initialisation using the k-means++ algorithm
 266   // http://en.wikipedia.org/wiki/K-means%2B%2B
 267   //
 268   // k-means++ is an algorithm for choosing the initial values for k-means clustering in statistics and machine learning.
 269   // It was proposed in 2007 by David Arthur and Sergei Vassilvitskii as an approximation algorithm for the NP-hard k-means problem---
 270   // a way of avoiding the sometimes poor clusterings found by the standard k-means algorithm.
 271   //
 272   //
 273   TH1F d2("d2", "", n, -0.5, Float_t(n)-0.5);
 274   d2.Reset();
 275
 276   // (1) Chose first center as a random point among the input data.
 277   Int_t ir = Int_t(Float_t(n) * gRandom->Rndm());
 278   mx[0] = x[ir];
 279   my[0] = y[ir];
 280
 281   // (2) Iterate
 282   Int_t icl = 1;
 283   while(icl < k)
 284     {
 285       // find min distance to existing clusters
 286       for (Int_t j = 0; j < n; j++) {
 287         Double_t dmin = 1.e10;
 288         for (Int_t i = 0; i < icl; i++) {
 289           Double_t dij = d(mx[i], my[i], x[j], y[j]);
 290           if (dij < dmin) dmin = dij;
 291         } // clusters
 292         d2.Fill(Float_t(j), dmin);
 293       } // data points
 294       // select a new cluster from data points with probability ~d2
 295       ir = Int_t(d2.GetRandom() + 0.5);
 296       mx[icl] = x[ir];
 297       my[icl] = y[ir];
 298       icl++;
 299     } // icl
 300 }
 301
 302