Improving binning hanlding in the LTM robust estimator
authormivanov <marian.ivanov@cern.ch>
Thu, 16 Jan 2014 17:22:09 +0000 (18:22 +0100)
committermivanov <marian.ivanov@cern.ch>
Thu, 16 Jan 2014 17:22:09 +0000 (18:22 +0100)
adding the test function

STAT/Macros/TStatToolkitTest.C [new file with mode: 0644]
STAT/TStatToolkit.cxx

diff --git a/STAT/Macros/TStatToolkitTest.C b/STAT/Macros/TStatToolkitTest.C
new file mode 100644 (file)
index 0000000..d167948
--- /dev/null
@@ -0,0 +1,175 @@
+/*
+  .L $ALICE_ROOT/STAT/Macros/TStatToolkitTest.C+
+
+*/
+
+#include "TH1.h"
+#include "TF1.h"
+#include "TMath.h"
+#include "TRandom.h"
+#include "TVectorD.h"
+#include "TStatToolkit.h"
+#include "TTreeStream.h"
+#include "TLegend.h"
+#include "TGraphErrors.h"
+#include "TCut.h"
+#include "TCanvas.h"
+
+TObjArray arrayFit(3);
+Int_t kMarkers[10]={25,24,20,21};
+Int_t kColors[10]={1,2,4,3};
+const char * names[10] = {"LTM","LThisto","LTMhisto1"};
+const char * distNames[10] = {"Gauss","Gauss+flat","Gauss(m)+0.2*Gauss(m+5#sigma)","Gauss(m)+0.3*Gauss(m+3#sigma)"};
+
+
+void TestLTM(Int_t nevents=10000){
+  //
+  // Goal test and benchamerk numerical stability and precission of the LTM method
+  // Binned data and not binned data: 
+  // Here we assume that binwidth<sigma
+  // Distribution types examples used in test:
+  //
+  //   0 - gaussian
+  //   1 - gauss+uniform background
+  //   2 - gauss+second gaus 5 sigma away 
+  //   
+  Int_t npointsMax=1000;
+  TTreeSRedirector * pcstream = new TTreeSRedirector("TStatToolkit_LTMtest.root","recreate");
+  //
+  TVectorD values(npointsMax);
+  TVectorD vecLTM(6);
+  TVectorD meanLTM(6);
+  TVectorD sigmaLTM(6);
+  TVectorD meanLTMHisto(6);
+  TVectorD sigmaLTMHisto(6);
+  TVectorD meanLTMHisto1(6);
+  TVectorD sigmaLTMHisto1(6);
+  TVectorD paramLTM(10);
+  //
+  for (Int_t iltm=0; iltm<6; iltm++) vecLTM[iltm]=0.99999*(0.5+iltm/10.);
+  TF1 fg("fg","gaus");
+  for (Int_t ievent = 0; ievent<nevents; ievent++){
+    if (ievent%1000==0) printf("%d\n",ievent);
+    Int_t distType=Int_t(gRandom->Rndm()*4);
+    Int_t npoints= 50+(npointsMax-50)*gRandom->Rndm();
+    Double_t mean  = 0.5+(gRandom->Rndm()-0.5)*0.2;
+    Double_t sigma = mean*0.2*(1+2*gRandom->Rndm());
+    TH1F histo("histo","histo",100,-0.5,1.5);
+    //
+    for (Int_t ipoint=0; ipoint<npoints; ipoint++){
+      Double_t value=0;
+      if (distType==0) value=gRandom->Gaus(mean,sigma); // gauss
+      if (distType==1) {
+       if (gRandom->Rndm()>0.2) {                     //  gauss + background
+         value=gRandom->Gaus(mean,sigma);
+       }else{
+         value=mean+(gRandom->Rndm()-0.5)*2;
+       }
+      }
+      if (distType==2) {
+       if (gRandom->Rndm()>0.2) {                     //  gauss + second gaus 5 sigma away
+         value=gRandom->Gaus(mean,sigma);
+       }else{
+         value=gRandom->Gaus(mean+5*sigma,sigma);
+       }
+      }
+      if (distType==3) {
+       if (gRandom->Rndm()>0.3) {                     //  gauss + second gaus 4 sigma away
+         value=gRandom->Gaus(mean,sigma);
+       }else{
+         value=gRandom->Gaus(mean+5*sigma,sigma);
+       }
+      }
+      values[ipoint]=value;
+      histo.Fill(value);
+    }
+    //
+    histo.Fit(&fg,"QN","QN");
+    Double_t meanG = fg.GetParameter(1);
+    Double_t rmsG = fg.GetParameter(2);
+    Double_t meanA  = TMath::Mean(npoints,values.GetMatrixArray());
+    Double_t rmsA   = TMath::Mean(npoints,values.GetMatrixArray());
+    Double_t meanH  = histo.GetMean();
+    Double_t rmsH  = histo.GetRMS();
+    //
+    for (Int_t iltm=0; iltm<6; iltm++){
+      //    
+      Double_t meanV,sigmaV=0;
+      TStatToolkit::EvaluateUni(npoints,values.GetMatrixArray(), meanV,sigmaV, vecLTM[iltm]*npoints);
+      meanLTM[iltm]=meanV;
+      sigmaLTM[iltm]=sigmaV;
+      //
+      TStatToolkit::LTM(&histo, &paramLTM,  vecLTM[iltm]);
+      meanLTMHisto1[iltm]=paramLTM[1];
+      sigmaLTMHisto1[iltm]=paramLTM[2];
+      //
+      TStatToolkit::LTMHisto(&histo, paramLTM,  vecLTM[iltm]);
+      meanLTMHisto[iltm]=paramLTM[1];
+      sigmaLTMHisto[iltm]=paramLTM[2];
+    }
+    (*pcstream)<<"ltm"<<
+      //Input
+      "npoints="<<npoints<<
+      "distType="<<distType<<
+      "mean="<<mean<<
+      "sigma="<<sigma<<
+      // "Standart" statistic output
+      "meanA="<<meanA<<
+      "rmsA="<<rmsA<<
+      "meanH="<<meanH<<
+      "rmsH="<<rmsH<<
+      "meanG="<<meanG<<
+      "rmsG="<<rmsG<<
+      // "LTM" output
+      "vecLTM.="<<&vecLTM<<
+      "meanLTM.="<<&meanLTM<<
+      "meanLTMHisto.="<<&meanLTMHisto<<
+      "meanLTMHisto1.="<<&meanLTMHisto1<<
+      //
+      "sigmaLTM.="<<&sigmaLTM<<
+      "sigmaLTMHisto.="<<&sigmaLTMHisto<<
+      "sigmaLTMHisto1.="<<&sigmaLTMHisto1<<      
+      "\n";
+  }
+  delete pcstream;
+  //
+  //
+  //
+  TFile *fltm = TFile::Open("TStatToolkit_LTMtest.root","update");
+  TTree * tree = (TTree*)fltm->Get("ltm");
+  tree->SetMarkerSize(0.5);
+  tree->SetMarkerStyle(25);
+  //
+  // 1. Get numerical error estimate of the LTM method for gaussian distribution  
+  //
+  TH2 * hisLTMTrunc[10]={0};  
+  TH1 * hisResol[10]={0};
+  TGraphErrors * grSigma[10]={0};
+  TCanvas *canvasLTM= new TCanvas("canvasLTM","canvasLTM",800,700);
+  canvasLTM->Divide(2,2);
+  for (Int_t itype=0; itype<4; itype++){
+    canvasLTM->cd(itype+1);
+    TCut cutType=TString::Format("distType==0%d",itype).Data();
+    tree->Draw("sqrt(npoints-2)*(meanLTM.fElements-mean)/sigma:vecLTM.fElements>>hisLTM(6,0.45,1.05,100,-10,10)",cutType+"npoints>50&&sigma>0.1","colzgof");
+    hisLTMTrunc[0]= (TH2*)(tree->GetHistogram()->Clone());
+    tree->Draw("sqrt(npoints-2)*(meanLTMHisto.fElements-mean)/sigma:vecLTM.fElements>>hisLTMHisto(6,0.45,1.05,100,-10,10)",cutType+"npoints>50&&sigma>0.1","colzgoff");
+    hisLTMTrunc[1]= (TH2*)tree->GetHistogram()->Clone();
+    tree->Draw("sqrt(npoints-2)*(meanLTMHisto1.fElements-mean)/sigma:vecLTM.fElements>>hisLTMHist1(6,0.45,1.05,100,-10,10)",cutType+"npoints>50&&sigma>0.1","colzgoff");
+    hisLTMTrunc[2]= (TH2*)tree->GetHistogram()->Clone();
+    TLegend * legend = new TLegend(0.5,0.7,0.9,0.9,distNames[itype]);
+    legend->SetBorderSize(0);
+    for (Int_t ihis=0; ihis<3; ihis++){
+      // MakeStat1D(TH2 * his, Int_t deltaBin, Double_t fraction, Int_t returnType, Int_t markerStyle, Int_t markerColor);
+      grSigma[ihis]=TStatToolkit::MakeStat1D( hisLTMTrunc[ihis],0,0.99,5,kMarkers[ihis],kColors[ihis]);
+      grSigma[ihis]->GetXaxis()->SetTitle("LTM fraction");
+      grSigma[ihis]->GetYaxis()->SetTitle("#sigma_{meas}/sigma_{gauss}");
+      if (ihis==0)grSigma[ihis]->Draw("alp");
+      if (ihis>0)grSigma[ihis]->Draw("lp");
+      legend->AddEntry(grSigma[ihis],names[ihis],"p");
+    }
+    legend->Draw();
+  }
+  canvasLTM->SaveAs("robustStatLTM_Performance.pdf");
+
+
+}
index 1a856fb..2ac19dc 100644 (file)
@@ -276,11 +276,13 @@ void TStatToolkit::LTM(TH1 * his, TVectorD *param , Float_t fraction,  Bool_t ve
   Double_t *data  = new Double_t[nentries];
   Int_t npoints=0;
   for (Int_t ibin=1;ibin<nbins; ibin++){
-    Float_t entriesI = his->GetBinContent(ibin);
-    Float_t xcenter= his->GetBinCenter(ibin);
+    Double_t entriesI = his->GetBinContent(ibin);
+    //Double_t xcenter= his->GetBinCenter(ibin);
+    Double_t x0 =  his->GetXaxis()->GetBinLowEdge(ibin);
+    Double_t w  =  his->GetXaxis()->GetBinWidth(ibin);
     for (Int_t ic=0; ic<entriesI; ic++){
       if (npoints<nentries){
-       data[npoints]= xcenter;
+       data[npoints]= x0+w*Double_t((ic+0.5)/entriesI);
        npoints++;
       }
     }
@@ -324,28 +326,35 @@ void TStatToolkit::MedianFilter(TH1 * his1D, Int_t nmedian){
 Bool_t TStatToolkit::LTMHisto(TH1 *his1D, TVectorD &params , Float_t fraction){
   //
   // LTM : Trimmed mean on histogram - Modified version for binned data
+  // 
   // Robust statistic to estimate properties of the distribution
-  // See http://en.wikipedia.org/w/index.php?title=Trimmed_estimator&oldid=582847999
+  // To handle binning error special treatment
+  // for definition of unbinned data see:
+  //     http://en.wikipedia.org/w/index.php?title=Trimmed_estimator&oldid=582847999
   //
-  // Paramters:
+  // Function parameters:
   //     his1D   - input histogram
   //     params  - vector with parameters
   //             - 0 - area
   //             - 1 - mean
-  //             - 2 - rms
+  //             - 2 - rms 
   //             - 3 - error estimate of mean
-  //             - 4 - dummy
+  //             - 4 - error estimate of RMS
   //             - 5 - first accepted bin position
   //             - 6 - last accepted  bin position
   //
   Int_t nbins    = his1D->GetNbinsX();
   Int_t nentries = (Int_t)his1D->GetEntries();
+  const Double_t kEpsilon=0.0000000001;
+
   if (nentries<=0) return 0;
+  if (fraction>1) fraction=0;
+  if (fraction<0) return 0;
   TVectorD vectorX(nbins);
   TVectorD vectorMean(nbins);
   TVectorD vectorRMS(nbins);
   Double_t sumCont=0;
-  for (Int_t ibin0=1; ibin0<nbins; ibin0++) sumCont+=his1D->GetBinContent(ibin0);
+  for (Int_t ibin0=1; ibin0<=nbins; ibin0++) sumCont+=his1D->GetBinContent(ibin0);
   //
   Double_t minRMS=his1D->GetRMS()*10000;
   Int_t maxBin=0;
@@ -359,19 +368,66 @@ Bool_t TStatToolkit::LTMHisto(TH1 *his1D, TVectorD &params , Float_t fraction){
       sum0+=cont;
       sum1+=cont*x;
       sum2+=cont*x*x;
-      if (sum0>fraction*sumCont) break;
+      if ( (ibin0!=ibin1) && sum0>=fraction*sumCont) break;
     }
     vectorX[ibin0]=his1D->GetBinCenter(ibin0);
-    if (sum0>fraction*sumCont){
+    if (sum0<fraction*sumCont) continue;
+    //
+    // substract fractions of bin0 and bin1 to keep sum0=fration*sumCont
+    //
+    Double_t diff = sum0-fraction*sumCont;
+    Double_t mean = sum1/sum0;
+    //
+    Double_t x0=his1D->GetBinCenter(ibin0);
+    Double_t x1=his1D->GetBinCenter(ibin1);
+    Double_t y0=his1D->GetBinContent(ibin0);
+    Double_t y1=his1D->GetBinContent(ibin1);
+    //
+    Double_t d = y0+y1-diff;    //enties to keep 
+    Double_t w0=0,w1=0;
+    if (y0<=kEpsilon&&y1>kEpsilon){
+      w1=d/y1;
+    } 
+    if (y1<=kEpsilon&&y0>kEpsilon){
+      w0=d/y0;
+    }
+    if (y0>kEpsilon && y1>kEpsilon && x1>x0  ){
+      w0 = (d*(x1-mean))/((x1-x0)*y0);
+      w1 = (d-y0*w0)/y1;
+      //
+      if (w0>1) {w1+=(w0-1)*y0/y1; w0=1;}
+      if (w1>1) {w0+=(w1-1)*y1/y0; w1=1;}
+    }  
+    if ( (x1>x0) &&TMath::Abs(y0*w0+y1*w1-d)>kEpsilon*sum0){
+      printf(" TStatToolkit::LTMHisto error\n");
+    }
+    sum0-=y0+y1;
+    sum1-=y0*x0;
+    sum1-=y1*x1;
+    sum2-=y0*x0*x0;
+    sum2-=y1*x1*x1;
+    //
+    Double_t xx0=his1D->GetXaxis()->GetBinUpEdge(ibin0)-0.5*w0*his1D->GetBinWidth(ibin0);
+    Double_t xx1=his1D->GetXaxis()->GetBinLowEdge(ibin1)+0.5*w1*his1D->GetBinWidth(ibin1);
+    sum0+=y0*w0+y1*w1;
+    sum1+=y0*w0*xx0;
+    sum1+=y1*w1*xx1;
+    sum2+=y0*w0*xx0*xx0;
+    sum2+=y1*w1*xx1*xx1;
+
+    //
+    // choose the bin with smallest rms
+    //
+    if (sum0>0){
       vectorMean[ibin0]=sum1/sum0;
-      vectorRMS[ibin0]=TMath::Sqrt(sum2/sum0-vectorMean[ibin0]*vectorMean[ibin0]);
+      vectorRMS[ibin0]=TMath::Sqrt(TMath::Abs(sum2/sum0-vectorMean[ibin0]*vectorMean[ibin0]));
       if (vectorRMS[ibin0]<minRMS){
        minRMS=vectorRMS[ibin0];
        params[0]=sum0;
        params[1]=vectorMean[ibin0];
        params[2]=vectorRMS[ibin0];
        params[3]=vectorRMS[ibin0]/TMath::Sqrt(sumCont*fraction);
-       params[4]=0;
+       params[4]=0; // what is the formula for error of RMS???
        params[5]=ibin0;
        params[6]=ibin1;
        params[7]=his1D->GetBinCenter(ibin0);
@@ -383,7 +439,6 @@ Bool_t TStatToolkit::LTMHisto(TH1 *his1D, TVectorD &params , Float_t fraction){
     }
   }
   return kTRUE;
-
 }
 
 
@@ -845,8 +900,8 @@ TGraphErrors * TStatToolkit::MakeStat1D(TH2 * his, Int_t deltaBin, Double_t frac
       err = projection->GetRMSError();
     }
     if (returnType==2 || returnType==3){
-      if (returnType==2) stat= vecLTM[1];
-      if (returnType==3) stat= vecLTM[2];      
+      if (returnType==2) {stat= vecLTM[1];  err =projection->GetRMSError();}
+       if (returnType==3) {stat= vecLTM[2];     err =projection->GetRMSError();}
     }
     if (returnType==4|| returnType==5){
       projection->Fit(&f1,"QN","QN", vecLTM[7], vecLTM[8]);