Update NVIDIA GPU Tracking library to be compatible to AliRoot patch 64473, add preli...
authordrohr <drohr@f7af4fe6-9843-0410-8265-dc069ae4e863>
Sun, 27 Oct 2013 11:35:32 +0000 (11:35 +0000)
committerdrohr <drohr@f7af4fe6-9843-0410-8265-dc069ae4e863>
Sun, 27 Oct 2013 11:35:32 +0000 (11:35 +0000)
22 files changed:
HLT/TPCLib/tracking-ca/AliHLTTPCCADef.h
HLT/TPCLib/tracking-ca/AliHLTTPCCAMath.h
HLT/TPCLib/tracking-ca/AliHLTTPCCASliceData.h
HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.h
HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.cxx
HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerBase.cxx [new file with mode: 0644]
HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerBase.h [new file with mode: 0644]
HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerCommon.h [new file with mode: 0644]
HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu
HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu.x86_64-pc-linux-gnu.patch [deleted file]
HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.h
HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.cl [new file with mode: 0644]
HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.cxx [new file with mode: 0644]
HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.h [new file with mode: 0644]
HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCLInternals.h [new file with mode: 0644]
HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCATrackletConstructorGPU.h
HLT/TPCLib/tracking-ca/cagpu/makefile
HLT/TPCLib/tracking-ca/cagpu/makefiles/include.S [new file with mode: 0644]
HLT/TPCLib/tracking-ca/cagpu/makefiles/makefile_opencl_compiler.cpp [new file with mode: 0644]
HLT/TPCLib/tracking-ca/cagpu/makefiles/opencl_compiler_structs.h [new file with mode: 0644]
HLT/TPCLib/tracking-ca/cagpu/makefiles/opencl_obtain_program.h [new file with mode: 0644]

index 65ff3f4..9e34bc3 100644 (file)
@@ -42,7 +42,7 @@
 #endif //VSNET_RUNTIME
 #endif //R__WIN32
 
-#ifdef HLTCA_STANDALONE
+#if defined(HLTCA_STANDALONE) || (defined(HLTCA_GPUCODE) && defined(__OPENCL__) && !defined(HLTCA_HOSTCODE))
 
 // class TObject{};
 
index fa5cfcc..e24b439 100644 (file)
@@ -72,7 +72,7 @@ typedef AliHLTTPCCAMath CAMath;
 #if defined( HLTCA_STANDALONE )
 #define choiceA(c1,c2,c3) c2
 #else //HLTCA_STANDALONE
-#define choiceA(c1,c2,c3) c3
+#define choiceA(c1,c2,c3) c2
 #endif //HLTCA_STANDALONE
 #else //HLTCA_HOSTCODE
 #define choiceA(c1, c2, c3) c2
index 14914cd..04db579 100644 (file)
@@ -20,7 +20,9 @@
 #include "AliHLTTPCCADef.h"
 #include "AliHLTTPCCARow.h"
 #include "AliHLTTPCCAMath.h"
+#if !(defined(HLTCA_GPUCODE) && defined(__OPENCL__) && !defined(HLTCA_HOSTCODE))
 #include "AliHLTArray.h"
+#endif
 #include "AliHLTTPCCAGPUConfig.h"
 
 typedef int int_v;
index 387d76f..bae170b 100644 (file)
@@ -24,7 +24,6 @@
 #include "AliHLTTPCCAMath.h"
 #include "MemoryAssignmentHelpers.h"
 
-#include "TStopwatch.h"
 #include "AliHLTTPCCAHitArea.h"
 #include "AliHLTTPCCANeighboursFinder.h"
 #include "AliHLTTPCCANeighboursCleaner.h"
@@ -39,6 +38,7 @@
 #include "AliHLTTPCCAGPUConfig.h"
 
 #if !defined(HLTCA_GPUCODE)
+#include "TStopwatch.h"
 #include <iostream>
 #include <iomanip>
 #include <string.h>
index 3f353d3..7abf200 100644 (file)
@@ -30,7 +30,9 @@ MEM_CLASS_PRE() class AliHLTTPCCATrackParam;
 class AliHLTTPCCAClusterData;
 MEM_CLASS_PRE() class AliHLTTPCCARow;
 
+#if !(defined(HLTCA_GPUCODE) && defined(__OPENCL__) && !defined(HLTCA_HOSTCODE))
 #include "TStopwatch.h"
+#endif
 
 /**
  * @class AliHLTTPCCATracker
index 2d82fe3..bcfbd7d 100644 (file)
@@ -229,9 +229,9 @@ AliHLTTPCCATrackerFramework::AliHLTTPCCATrackerFramework(int allowGPU, const cha
                if (allowGPU)
                {
                        #ifndef R__WIN32
-                               HLTInfo("The following error occured during dlopen: %s", dlerror());
+                               HLTImportant("The following error occured during dlopen: %s", dlerror());
                        #endif
-                       HLTError("Error Opening cagpu library for GPU Tracker, will fallback to CPU");
+                       HLTError("Error Opening cagpu library for GPU Tracker (%s), will fallback to CPU", GPU_Library == NULL ? "default: " GPULIBNAME : GPU_Library);
                }
                else
                {
diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerBase.cxx b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerBase.cxx
new file mode 100644 (file)
index 0000000..06a0861
--- /dev/null
@@ -0,0 +1,1030 @@
+// **************************************************************************
+// This file is property of and copyright by the ALICE HLT Project          *
+// ALICE Experiment at CERN, All rights reserved.                           *
+//                                                                          *
+// Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
+//                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *
+//                                     David Rohr <drohr@kip.uni-heidelberg.de>                                *
+//                  for The ALICE HLT Project.                              *
+//                                                                          *
+// Permission to use, copy, modify and distribute this software and its     *
+// documentation strictly for non-commercial purposes is hereby granted     *
+// without fee, provided that the above copyright notice appears in all     *
+// copies and that both the copyright notice and this permission notice     *
+// appear in the supporting documentation. The authors make no claims       *
+// about the suitability of this software for any purpose. It is            *
+// provided "as is" without express or implied warranty.                    *
+//                                                                          *
+//***************************************************************************
+
+#include <string.h>
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+#include "AliHLTTPCCAGPUTrackerBase.h"
+#include "AliHLTTPCCAClusterData.h"
+#include "AliHLTTPCCAGPUTrackerCommon.h"
+
+ClassImp( AliHLTTPCCAGPUTrackerBase )
+
+int AliHLTTPCCAGPUTrackerBase::GlobalTracking(int iSlice, int threadId, AliHLTTPCCAGPUTrackerBase::helperParam* hParam)
+{
+       if (fDebugLevel >= 3) {HLTDebug("GPU Tracker running Global Tracking for slice %d on thread %d\n", iSlice, threadId);}
+
+       int sliceLeft = (iSlice + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2);
+       int sliceRight = (iSlice + 1) % (fgkNSlices / 2);
+       if (iSlice >= fgkNSlices / 2)
+       {
+               sliceLeft += fgkNSlices / 2;
+               sliceRight += fgkNSlices / 2;
+       }
+       while (fSliceOutputReady < iSlice || fSliceOutputReady < sliceLeft || fSliceOutputReady < sliceRight)
+       {
+               if (hParam != NULL && hParam->fReset) return(1);
+       }
+
+       pthread_mutex_lock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceLeft]);
+       pthread_mutex_lock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceRight]);
+       fSlaveTrackers[iSlice].PerformGlobalTracking(fSlaveTrackers[sliceLeft], fSlaveTrackers[sliceRight], HLTCA_GPU_MAX_TRACKS);
+       pthread_mutex_unlock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceLeft]);
+       pthread_mutex_unlock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceRight]);
+
+       fSliceLeftGlobalReady[sliceLeft] = 1;
+       fSliceRightGlobalReady[sliceRight] = 1;
+       if (fDebugLevel >= 3) {HLTDebug("GPU Tracker finished Global Tracking for slice %d on thread %d\n", iSlice, threadId);}
+       return(0);
+}
+
+void* AliHLTTPCCAGPUTrackerBase::helperWrapper(void* arg)
+{ //Thread entry point passed to pthread_create by StartHelperThreads; arg is the helperParam of this thread. Always ends via pthread_exit(NULL).
+       AliHLTTPCCAGPUTrackerBase::helperParam* par = (AliHLTTPCCAGPUTrackerBase::helperParam*) arg;
+       AliHLTTPCCAGPUTrackerBase* cls = par->fCls; //Tracker object that owns this helper thread
+
+       AliHLTTPCCATracker* tmpTracker = new AliHLTTPCCATracker; //Private tracker, used only in the CPUTracker branch below; deleted before the thread exits
+
+#ifdef HLTCA_STANDALONE
+       if (cls->fDebugLevel >= 2) HLTInfo("\tHelper thread %d starting", par->fNum);
+#endif
+
+#if defined(HLTCA_STANDALONE) & !defined(_WIN32)
+       cpu_set_t mask; //NOTE: the mask is filled but never applied, the sched_setaffinity call below is commented out
+       CPU_ZERO(&mask);
+       CPU_SET(par->fNum * 2 + 2, &mask);
+       //sched_setaffinity(0, sizeof(mask), &mask);
+#endif
+
+       while(pthread_mutex_lock(&((pthread_mutex_t*) par->fMutex)[0]) == 0 && par->fTerminate == false) //One iteration per acquisition of fMutex[0]; leaves the loop once fTerminate is set (or locking fails)
+       {
+               if (par->CPUTracker) //This thread does the complete tracking of its slices on the CPU
+               {
+                       for (int i = 0;i < cls->fNSlicesPerCPUTracker;i++)
+                       {
+                               int myISlice = cls->fSliceCount - cls->fNCPUTrackers * cls->fNSlicesPerCPUTracker + (par->fNum - cls->fNHelperThreads) * cls->fNSlicesPerCPUTracker + i; //CPU trackers take the last fNCPUTrackers * fNSlicesPerCPUTracker slices of fSliceCount
+#ifdef HLTCA_STANDALONE
+                               if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Doing full CPU tracking, Slice %d", par->fNum, myISlice);
+#endif
+                               if (myISlice >= 0) //Negative indices are skipped
+                               {
+                                       tmpTracker->Initialize(cls->fSlaveTrackers[par->fFirstSlice + myISlice].Param());
+                                       tmpTracker->ReadEvent(&par->pClusterData[myISlice]);
+                                       tmpTracker->DoTracking();
+                                       tmpTracker->SetOutput(&par->pOutput[myISlice]);
+                                       pthread_mutex_lock((pthread_mutex_t*) cls->fHelperMemMutex); //WriteOutputPrepare is serialized between threads by fHelperMemMutex
+                                       tmpTracker->WriteOutputPrepare();
+                                       pthread_mutex_unlock((pthread_mutex_t*) cls->fHelperMemMutex);
+                                       tmpTracker->WriteOutput();
+
+                                       /*cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUSliceDataMemory((char*) new uint4[HLTCA_GPU_SLICE_DATA_MEMORY/sizeof(uint4)], (char*) new uint4[HLTCA_GPU_ROWS_MEMORY/sizeof(uint4)]);
+                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].ReadEvent(&par->pClusterData[myISlice]);
+                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersTracklets(HLTCA_GPU_MAX_TRACKLETS);
+                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersHits(par->pClusterData[myISlice].NumberOfClusters());
+                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersTracks(HLTCA_GPU_MAX_TRACKS, par->pClusterData[myISlice].NumberOfClusters());
+                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS, cls->fConstructorBlockCount);
+                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].HitMemorySize()/sizeof( uint4 ) + 100]), par->pClusterData[myISlice].NumberOfClusters());
+                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerTracksMemory(reinterpret_cast<char*> ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackMemorySize()/sizeof( uint4 ) + 100]), HLTCA_GPU_MAX_TRACKS, par->pClusterData[myISlice].NumberOfClusters());
+                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].DoTracking();
+                                       cls->WriteOutput(par->pOutput, par->fFirstSlice, myISlice, par->fNum + 1);
+                                       delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].HitMemory();
+                                       delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackletMemory();
+                                       delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackMemory();*/
+                               }
+#ifdef HLTCA_STANDALONE
+                               if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Finished, Slice %d", par->fNum, myISlice);
+#endif
+                       }
+               }
+               else //This thread only reads events / writes output for slices processed elsewhere
+               {
+                       int mustRunSlice19 = 0; //Set when the output of slice 19 has to be written after the loop below
+                       for (int i = par->fNum + 1;i < par->fSliceCount;i += cls->fNHelperThreads + 1) //Slices are dealt out with a stride of fNHelperThreads + 1, this thread starts at fNum + 1
+                       {
+                               //if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Running, Slice %d+%d, Phase %d", par->fNum, par->fFirstSlice, i, par->fPhase);
+                               if (par->fPhase) //Non-zero phase: write output (optionally after global tracking); zero phase: read the event
+                               {
+                                       if (cls->fUseGlobalTracking)
+                                       {
+                                               int realSlice = i + 1; //With global tracking the slice handled is shifted by one relative to i
+                                               if (realSlice % (fgkNSlices / 2) < 1) realSlice -= fgkNSlices / 2; //Wrap around to the first slice of the same half
+
+                                               if (realSlice % (fgkNSlices / 2) != 1) //The slice at position 1 of each half is not global-tracked by this thread
+                                               {
+                                                       cls->GlobalTracking(realSlice, par->fNum + 1, par);
+                                               }
+
+                                               if (realSlice == 19) //Hard-coded slice number: its output is deferred until after the loop
+                                               {
+                                                       mustRunSlice19 = 1;
+                                               }
+                                               else
+                                               {
+                                                       while (cls->fSliceLeftGlobalReady[realSlice] == 0 || cls->fSliceRightGlobalReady[realSlice] == 0) //Spin until both ready flags of realSlice have been set by GlobalTracking calls
+                                                       {
+                                                               if (par->fReset) goto ResetHelperThread; //Abort this round when a reset was requested
+                                                       }
+                                                       cls->WriteOutput(par->pOutput, par->fFirstSlice, realSlice, par->fNum + 1);
+                                               }
+                                       }
+                                       else
+                                       {
+                                               while (cls->fSliceOutputReady < i) //Spin until fSliceOutputReady has reached slice i
+                                               {
+                                                       if (par->fReset) goto ResetHelperThread;
+                                               }
+                                               cls->WriteOutput(par->pOutput, par->fFirstSlice, i, par->fNum + 1);
+                                       }
+                               }
+                               else
+                               {
+                                       cls->ReadEvent(par->pClusterData, par->fFirstSlice, i, par->fNum + 1);
+                                       par->fDone = i + 1; //Publish the progress: slice i has been read
+                               }
+                               //if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Finished, Slice %d+%d, Phase %d", par->fNum, par->fFirstSlice, i, par->fPhase);
+                       }
+                       if (mustRunSlice19) //Deferred output of slice 19, see above
+                       {
+                               while (cls->fSliceLeftGlobalReady[19] == 0 || cls->fSliceRightGlobalReady[19] == 0)
+                               {
+                                       if (par->fReset) goto ResetHelperThread;
+                               }
+                               cls->WriteOutput(par->pOutput, par->fFirstSlice, 19, par->fNum + 1);
+                       }
+               }
+ResetHelperThread:
+               cls->ResetThisHelperThread(par); //Reached after every round, normal or aborted: clears fReset and unlocks fMutex[1]
+       }
+#ifdef HLTCA_STANDALONE
+       if (cls->fDebugLevel >= 2) HLTInfo("\tHelper thread %d terminating", par->fNum);
+#endif
+       delete tmpTracker;
+       pthread_mutex_unlock(&((pthread_mutex_t*) par->fMutex)[1]); //StopHelperThreads locks fMutex[1] while waiting for this thread to finish
+       pthread_exit(NULL);
+       return(NULL);
+}
+
+void AliHLTTPCCAGPUTrackerBase::ResetThisHelperThread(AliHLTTPCCAGPUTrackerBase::helperParam* par)
+{
+       //End the current round of a helper thread: report and clear a pending reset request,
+       //then unlock the second mutex of the thread
+       if (par->fReset)
+       {
+               HLTImportant("GPU Helper Thread %d reseting", par->fNum);
+       }
+       par->fReset = false;
+
+       pthread_mutex_t* const threadMutexes = (pthread_mutex_t*) par->fMutex;
+       pthread_mutex_unlock(&threadMutexes[1]);
+}
+
+#define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem"
+
+AliHLTTPCCAGPUTrackerBase::AliHLTTPCCAGPUTrackerBase()
+       : fGpuTracker(NULL)
+       , fGPUMemory(NULL)
+       , fHostLockedMemory(NULL)
+       , fGPUMergerMemory(NULL)
+       , fGPUMergerHostMemory(NULL)
+       , fGPUMergerMaxMemory(0)
+       , fDebugLevel(0)
+       , fDebugMask(0xFFFFFFFF)
+       , fOutFile(NULL)
+       , fGPUMemSize(0)
+       , fSliceCount(HLTCA_GPU_DEFAULT_MAX_SLICE_COUNT)
+       , fCudaDevice(0)
+       , fOutputControl(NULL)
+       , fThreadId(0)
+       , fCudaInitialized(0)
+       , fPPMode(0)
+       , fSelfheal(0)
+       , fConstructorBlockCount(30)
+       , selectorBlockCount(30)
+       , fNHelperThreads(HLTCA_GPU_DEFAULT_HELPER_THREADS)
+       , fHelperParams(NULL)
+       , fHelperMemMutex(NULL)
+       , fSliceOutputReady(0)
+       , fSliceGlobalMutexes(NULL)
+       , fNCPUTrackers(0)
+       , fNSlicesPerCPUTracker(0)
+       , fGlobalTracking(0)
+       , fUseGlobalTracking(0)
+       , fNSlaveThreads(0)
+{
+       //Default constructor: all pointers start as NULL and all counters and flags as 0, except
+       //the debug mask (all bits set), both block counts (30) and the slice / helper thread
+       //counts, which take their HLTCA_GPU_DEFAULT_* values. Nothing is allocated here.
+}
+
+AliHLTTPCCAGPUTrackerBase::~AliHLTTPCCAGPUTrackerBase()
+{
+       //Destructor: contains no statements of its own
+}
+
+void AliHLTTPCCAGPUTrackerBase::ReleaseGlobalLock(void* sem)
+{
+       //Release the global named semaphore that locks GPU Initialization.
+       //sem is the lock object created in InitGPU: a heap allocated HANDLE* on Windows
+       //(released, closed and deleted here), a sem_t* from sem_open otherwise.
+#ifdef R__WIN32
+       HANDLE* h = (HANDLE*) sem;
+       ReleaseSemaphore(*h, 1, NULL);
+       CloseHandle(*h);
+       delete h;
+#else
+       sem_t* pSem = (sem_t*) sem;
+       sem_post(pSem);
+       //sem_unlink only removes the name; the descriptor obtained from sem_open must be
+       //closed explicitly, otherwise every InitGPU call leaks one semaphore descriptor
+       sem_close(pSem);
+       sem_unlink(SemLockName);
+#endif
+}
+
+int AliHLTTPCCAGPUTrackerBase::CheckMemorySizes(int sliceCount)
+{
+       //Check constants for correct memory sizes: returns 1 (after logging an error) as soon as one compile time budget is too small, 0 if all three checks pass
+       if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY) //One tracker object per requested slice
+       {
+               HLTError("Insufficiant Tracker Object Memory for %d slices", sliceCount);
+               return(1);
+       }
+
+       if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY) //Common memory is checked for all fgkNSlices slices, independent of sliceCount
+       {
+               HLTError("Insufficiant Common Memory");
+               return(1);
+       }
+
+       if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY) //HLTCA_ROW_COUNT + 1 row objects per slice, again for all fgkNSlices slices
+       {
+               HLTError("Insufficiant Row Memory");
+               return(1);
+       }
+
+       if (fDebugLevel >= 3) //Report used / available size of each of the three regions
+       {
+               HLTInfo("Memory usage: Tracker Object %d / %d, Common Memory %d / %d, Row Memory %d / %d", (int) sizeof(AliHLTTPCCATracker) * sliceCount, HLTCA_GPU_TRACKER_OBJECT_MEMORY, (int) (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize()), HLTCA_GPU_COMMON_MEMORY, (int) (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow)), HLTCA_GPU_ROWS_MEMORY);
+       }
+       return(0);
+}
+
+void AliHLTTPCCAGPUTrackerBase::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
+{
+       //Store the new debug level. The debug output stream is only replaced when a
+       //non-NULL stream is passed, otherwise the current one is kept.
+       if (NewOutFile != NULL)
+       {
+               fOutFile = NewOutFile;
+       }
+       fDebugLevel = dwLevel;
+}
+
+int AliHLTTPCCAGPUTrackerBase::SetGPUTrackerOption(char* OptionName, int OptionValue)
+{
+       //Set a specific GPU Tracker Option
+       if (strcmp(OptionName, "PPMode") == 0)
+       {
+               fPPMode = OptionValue;
+       }
+       else if (strcmp(OptionName, "DebugMask") == 0)
+       {
+               fDebugMask = OptionValue;
+       }
+       else if (strcmp(OptionName, "HelperThreads") == 0)
+       {
+               fNHelperThreads = OptionValue;
+       }
+       else if (strcmp(OptionName, "CPUTrackers") == 0)
+       {
+               fNCPUTrackers = OptionValue;
+       }
+       else if (strcmp(OptionName, "SlicesPerCPUTracker") == 0)
+       {
+               fNSlicesPerCPUTracker = OptionValue;
+       }
+       else if (strcmp(OptionName, "GlobalTracking") == 0)
+       {
+               fGlobalTracking = OptionValue;
+       }
+       else
+       {
+               HLTError("Unknown Option: %s", OptionName);
+               return(1);
+       }
+
+       if (fNHelperThreads + fNCPUTrackers > fNSlaveThreads && fCudaInitialized)
+       {
+               HLTInfo("Insufficient Slave Threads available (%d), creating additional Slave Threads (%d+%d)\n", fNSlaveThreads, fNHelperThreads, fNCPUTrackers);
+               StopHelperThreads();
+               StartHelperThreads();
+       }
+
+       return(0);
+}
+
+#ifdef HLTCA_STANDALONE
+void AliHLTTPCCAGPUTrackerBase::StandalonePerfTime(int iSlice, int i)
+{
+       //Standalone build: query performance timer i of slice iSlice, but only at debug level 1 or above
+       if (fDebugLevel < 1) return;
+       AliHLTTPCCATracker::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));
+}
+#else
+void AliHLTTPCCAGPUTrackerBase::StandalonePerfTime(int /*iSlice*/, int /*i*/)
+{
+       //Non-standalone build: no timer is queried
+}
+#endif
+
+int AliHLTTPCCAGPUTrackerBase::SelfHealReconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
+{
+       if (!fSelfheal)
+       {
+               ReleaseThreadContext();
+               return(1);
+       }
+       static bool selfHealing = false;
+       if (selfHealing)
+       {
+               HLTError("Selfhealing failed, giving up");
+               ReleaseThreadContext();
+               return(1);
+       }
+       else
+       {
+               HLTError("Unsolvable CUDA error occured, trying to reinitialize GPU");
+       }                       
+       selfHealing = true;
+       ExitGPU();
+       if (InitGPU(fSliceCount, fCudaDevice))
+       {
+               HLTError("Could not reinitialize CUDA device, disabling GPU tracker");
+               ExitGPU();
+               return(1);
+       }
+       HLTInfo("GPU tracker successfully reinitialized, restarting tracking");
+       int retVal = Reconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal);
+       selfHealing = false;
+       return(retVal);
+}
+
+void AliHLTTPCCAGPUTrackerBase::ReadEvent(AliHLTTPCCAClusterData* pClusterData, int firstSlice, int iSlice, int threadId)
+{
+       fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
+#ifdef HLTCA_GPU_TIME_PROFILE
+       unsigned long long int a, b;
+       AliHLTTPCCATracker::StandaloneQueryTime(&a);
+#endif
+       fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
+#ifdef HLTCA_GPU_TIME_PROFILE
+       AliHLTTPCCATracker::StandaloneQueryTime(&b);
+       HLTInfo("Read %d %f %f\n", threadId, ((double) b - (double) a) / (double) fProfTimeC, ((double) a - (double) fProfTimeD) / (double) fProfTimeC);
+#endif
+}
+
+void AliHLTTPCCAGPUTrackerBase::WriteOutput(AliHLTTPCCASliceOutput** pOutput, int firstSlice, int iSlice, int threadId)
+{
+       if (fDebugLevel >= 3) {HLTDebug("GPU Tracker running WriteOutput for slice %d on thread %d\n", firstSlice + iSlice, threadId);}
+       fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
+#ifdef HLTCA_GPU_TIME_PROFILE
+       unsigned long long int a, b;
+       AliHLTTPCCATracker::StandaloneQueryTime(&a);
+#endif
+       if (fNHelperThreads) pthread_mutex_lock((pthread_mutex_t*) fHelperMemMutex);
+       fSlaveTrackers[firstSlice + iSlice].WriteOutputPrepare();
+       if (fNHelperThreads) pthread_mutex_unlock((pthread_mutex_t*) fHelperMemMutex);
+       fSlaveTrackers[firstSlice + iSlice].WriteOutput();
+#ifdef HLTCA_GPU_TIME_PROFILE
+       AliHLTTPCCATracker::StandaloneQueryTime(&b);
+       HLTInfo("Write %d %f %f\n", threadId, ((double) b - (double) a) / (double) fProfTimeC, ((double) a - (double) fProfTimeD) / (double) fProfTimeC);
+#endif
+       if (fDebugLevel >= 3) {HLTDebug("GPU Tracker finished WriteOutput for slice %d on thread %d\n", firstSlice + iSlice, threadId);}
+}
+
+int AliHLTTPCCAGPUTrackerBase::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
+{
+       //Initialize Slice Tracker Parameter for a slave tracker
+       fSlaveTrackers[iSlice].Initialize(param);
+       if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
+       {
+               HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
+               return(1);
+       }
+       return(0);
+}
+
+void AliHLTTPCCAGPUTrackerBase::ResetHelperThreads(int helpers)
+{
+       HLTImportant("Error occurred, GPU tracker helper threads will be reset (Number of threads %d/%d)", fNHelperThreads, fNCPUTrackers);
+       SynchronizeGPU();
+       ReleaseThreadContext();
+       for (int i = 0;i < fNHelperThreads + fNCPUTrackers;i++)
+       {
+               fHelperParams[i].fReset = true;
+               if (helpers || i >= fNHelperThreads) pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]);
+       }
+       HLTImportant("GPU Tracker helper threads have ben reset");
+}
+
+int AliHLTTPCCAGPUTrackerBase::StartHelperThreads()
+{
+       int nThreads = fNHelperThreads + fNCPUTrackers;
+       if (nThreads)
+       {
+               fHelperParams = new helperParam[nThreads];
+               if (fHelperParams == NULL)
+               {
+                       HLTError("Memory allocation error");
+                       ExitGPU();
+                       return(1);
+               }       
+               for (int i = 0;i < nThreads;i++)
+               {
+                       fHelperParams[i].fCls = this;
+                       fHelperParams[i].fTerminate = false;
+                       fHelperParams[i].fReset = false;
+                       fHelperParams[i].fNum = i;
+                       fHelperParams[i].fMutex = malloc(2 * sizeof(pthread_mutex_t));
+                       if (fHelperParams[i].fMutex == NULL)
+                       {
+                               HLTError("Memory allocation error");
+                               ExitGPU();
+                               return(1);
+                       }
+                       for (int j = 0;j < 2;j++)
+                       {
+                               if (pthread_mutex_init(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j], NULL))
+                               {
+                                       HLTError("Error creating pthread mutex");
+                                       ExitGPU();
+                                       return(1);
+                               }
+
+                               pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]);
+                       }
+                       fHelperParams[i].fThreadId = (void*) malloc(sizeof(pthread_t));
+
+                       if (pthread_create((pthread_t*) fHelperParams[i].fThreadId, NULL, helperWrapper, &fHelperParams[i]))
+                       {
+                               HLTError("Error starting slave thread");
+                               ExitGPU();
+                               return(1);
+                       }
+               }
+       }
+       fNSlaveThreads = nThreads;
+       return(0);
+}
+
+int AliHLTTPCCAGPUTrackerBase::StopHelperThreads()
+{
+       if (fNSlaveThreads)
+       {
+               for (int i = 0;i < fNSlaveThreads;i++)
+               {
+                       fHelperParams[i].fTerminate = true;
+                       if (pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]))
+                       {
+                               HLTError("Error unlocking mutex to terminate slave");
+                               return(1);
+                       }
+                       if (pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]))
+                       {
+                               HLTError("Error locking mutex");
+                               return(1);
+                       }
+                       if (pthread_join( *((pthread_t*) fHelperParams[i].fThreadId), NULL))
+                       {
+                               HLTError("Error waiting for thread to terminate");
+                               return(1);
+                       }
+                       free(fHelperParams[i].fThreadId);
+                       for (int j = 0;j < 2;j++)
+                       {
+                               if (pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]))
+                               {
+                                       HLTError("Error unlocking mutex before destroying");
+                                       return(1);
+                               }
+                               pthread_mutex_destroy(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]);
+                       }
+                       free(fHelperParams[i].fMutex);
+               }
+               delete[] fHelperParams;
+       }
+       fNSlaveThreads = 0;
+       return(0);
+}
+
+void AliHLTTPCCAGPUTrackerBase::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
+{
+       //Remember the output control structure and hand the same pointer to all fgkNSlices slave trackers
+       fOutputControl = val;
+
+       int iSlice = 0;
+       while (iSlice < fgkNSlices)
+       {
+               fSlaveTrackers[iSlice].SetOutputControl(val);
+               iSlice++;
+       }
+}
+
+int AliHLTTPCCAGPUTrackerBase::GetThread()
+{
+       //Return an integer identifying the calling thread: derived from GetCurrentThread() on
+       //Windows, the result of the gettid system call otherwise
+#ifdef R__WIN32
+       const size_t threadHandle = (size_t) GetCurrentThread();
+       return((int) threadHandle);
+#else
+       const long threadId = syscall (SYS_gettid);
+       return((int) threadId);
+#endif
+}
+
+unsigned long long int* AliHLTTPCCAGPUTrackerBase::PerfTimer(int iSlice, unsigned int i)
+{
+       //Return the pointer to performance timer i of slice iSlice, or NULL when there are no slave trackers
+       if (!fSlaveTrackers)
+       {
+               return(NULL);
+       }
+       return(fSlaveTrackers[iSlice].PerfTimer(i));
+}
+
+const AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerBase::OutputControl() const
+{
+       //Read-only access to the output control structure last stored by SetOutputControl (NULL after construction)
+       return(fOutputControl);
+}
+
+int AliHLTTPCCAGPUTrackerBase::GetSliceCount() const
+{
+       //Maximum number of slices that can be processed, i.e. the current value of fSliceCount
+       const int maxSlices = fSliceCount;
+       return maxSlices;
+}
+
+char* AliHLTTPCCAGPUTrackerBase::MergerBaseMemory()
+{
+       //Base address of the merger memory on the host: fGPUMergerHostMemory passed through
+       //alignPointer with 1024 * 1024 (presumably rounding to a 1 MiB boundary, alignPointer is defined elsewhere)
+       char* const mergerHostMemory = (char*) fGPUMergerHostMemory;
+       return(alignPointer(mergerHostMemory, 1024 * 1024));
+}
+
+int AliHLTTPCCAGPUTrackerBase::IsInitialized()
+{
+       //Value of fCudaInitialized: 0 after construction, set to 1 at the end of a successful InitGPU
+       const int initialized = fCudaInitialized;
+       return initialized;
+}
+
+//Initializes the GPU tracker.
+//  sliceCount:    number of slices to prepare for, -1 keeps the current fSliceCount
+//  forceDeviceID: passed through unchanged to InitGPU_Runtime
+//The runtime specific initialization (InitGPU_Runtime) runs under a global named semaphore.
+//Afterwards the host side slave trackers are pointed at the page locked host memory, the helper
+//threads are started and the helper / per-slice mutexes are created.
+//Returns 0 on success and 1 on any failure.
+int AliHLTTPCCAGPUTrackerBase::InitGPU(int sliceCount, int forceDeviceID)
+{
+#if defined(HLTCA_STANDALONE) & !defined(_WIN32)
+       cpu_set_t mask;
+       CPU_ZERO(&mask);
+       CPU_SET(0, &mask);
+       //sched_setaffinity(0, sizeof(mask), &mask);
+#endif
+
+       if (sliceCount == -1) sliceCount = fSliceCount;
+
+       if (CheckMemorySizes(sliceCount)) return(1);
+
+#ifdef R__WIN32
+       HANDLE* semLock = new HANDLE;
+       *semLock = CreateSemaphore(NULL, 1, 1, SemLockName);
+       if (*semLock == NULL)
+       {
+               HLTError("Error creating GPUInit Semaphore");
+               return(1);
+       }
+       WaitForSingleObject(*semLock, INFINITE);
+#else
+       //0x01B6 == 0666 octal: the named semaphore is created readable / writable for everybody
+       sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1);
+       if (semLock == SEM_FAILED)
+       {
+               HLTError("Error creating GPUInit Semaphore");
+               return(1);
+       }
+       timespec semtime;
+       clock_gettime(CLOCK_REALTIME, &semtime);
+       semtime.tv_sec += 10;
+       while (sem_timedwait(semLock, &semtime) != 0)
+       {
+               //A wait interrupted by a signal is not a timeout: retry with the same absolute deadline
+               //instead of forcibly releasing a lock that another process may legitimately hold
+               if (errno == EINTR) continue;
+               HLTError("Global Lock for GPU initialisation was not released for 10 seconds, assuming another thread died");
+               HLTWarning("Resetting the global lock");
+               sem_post(semLock);
+       }
+#endif
+
+       fThreadId = GetThread();
+
+       fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
+
+#ifdef HLTCA_GPU_MERGER
+       fGPUMergerMaxMemory = 2000000 * 5 * sizeof(float);
+       fGPUMemSize += fGPUMergerMaxMemory;
+#endif
+
+       int retVal = InitGPU_Runtime(sliceCount, forceDeviceID);
+       ReleaseGlobalLock(semLock);
+
+       if (retVal)
+       {
+               HLTImportant("GPU Tracker initialization failed");
+               return(1);
+       }
+
+       fSliceCount = sliceCount;
+       //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
+       fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);
+
+       for (int i = 0;i < fgkNSlices;i++)
+       {
+               fSlaveTrackers[i].SetGPUTracker();
+               fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
+               fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
+       }
+
+       //NOTE(review): no runtime cleanup is done here on failure, presumably StartHelperThreads cleans up itself - confirm
+       if (StartHelperThreads()) return(1);
+
+       fHelperMemMutex = malloc(sizeof(pthread_mutex_t));
+       if (fHelperMemMutex == NULL)
+       {
+               HLTError("Memory allocation error");
+               ExitGPU_Runtime();
+               return(1);
+       }
+
+       if (pthread_mutex_init((pthread_mutex_t*) fHelperMemMutex, NULL))
+       {
+               HLTError("Error creating pthread mutex");
+               ExitGPU_Runtime();
+               free(fHelperMemMutex);
+               fHelperMemMutex = NULL; //do not leave a dangling pointer behind
+               return(1);
+       }
+
+       fSliceGlobalMutexes = malloc(sizeof(pthread_mutex_t) * fgkNSlices);
+       if (fSliceGlobalMutexes == NULL)
+       {
+               HLTError("Memory allocation error");
+               ExitGPU_Runtime();
+               //Release the helper mutex created above, it would leak otherwise
+               pthread_mutex_destroy((pthread_mutex_t*) fHelperMemMutex);
+               free(fHelperMemMutex);
+               fHelperMemMutex = NULL;
+               return(1);
+       }
+       for (int i = 0;i < fgkNSlices;i++)
+       {
+               if (pthread_mutex_init(&((pthread_mutex_t*) fSliceGlobalMutexes)[i], NULL))
+               {
+                       HLTError("Error creating pthread mutex");
+                       ExitGPU_Runtime();
+                       //Undo everything created so far: slice mutexes [0, i), their storage and the helper mutex
+                       for (int j = 0;j < i;j++) pthread_mutex_destroy(&((pthread_mutex_t*) fSliceGlobalMutexes)[j]);
+                       free(fSliceGlobalMutexes);
+                       fSliceGlobalMutexes = NULL;
+                       pthread_mutex_destroy((pthread_mutex_t*) fHelperMemMutex);
+                       free(fHelperMemMutex);
+                       fHelperMemMutex = NULL;
+                       return(1);
+               }
+       }
+
+       fCudaInitialized = 1;
+       HLTImportant("GPU Tracker initialization successfull");
+
+#if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)
+       //The "&& 0" keeps this warm-up run permanently disabled
+       if (fDebugLevel < 2 && 0)
+       {
+               //Do one initial run for Benchmark reasons
+               const int useDebugLevel = fDebugLevel;
+               fDebugLevel = 0;
+               AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];
+
+               std::ifstream fin;
+
+               AliHLTTPCCAParam tmpParam;
+               AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;
+
+               fin.open("events/settings.dump");
+               int tmpCount;
+               fin >> tmpCount;
+               for (int i = 0;i < sliceCount;i++)
+               {
+                       fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);
+                       tmpParam.ReadSettings(fin);
+                       InitializeSliceParam(i, tmpParam);
+               }
+               fin.close();
+
+               fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);
+               for (int i = 0;i < sliceCount;i++)
+               {
+                       tmpCluster[i].StartReading(i, 0);
+                       tmpCluster[i].ReadEvent(fin);
+               }
+               fin.close();
+
+               AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];
+               memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));
+
+               Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);
+               for (int i = 0;i < sliceCount;i++)
+               {
+                       free(tmpOutput[i]);
+                       tmpOutput[i] = NULL;
+                       fSlaveTrackers[i].SetOutputControl(NULL);
+               }
+               delete[] tmpOutput;
+               delete[] tmpCluster;
+               fDebugLevel = useDebugLevel;
+       }
+#endif
+
+       //retVal is 0 here, every failure returned earlier
+       return(retVal);
+}
+
+//Shuts the tracker down: stops the helper threads, destroys and frees the helper and
+//per-slice mutexes and finally runs the runtime specific cleanup.
+//Returns 1 if the helper threads could not be stopped, otherwise the result of ExitGPU_Runtime.
+int AliHLTTPCCAGPUTrackerBase::ExitGPU()
+{
+       if (StopHelperThreads()) return(1);
+
+       //Guard against pointers that were never allocated or were already released,
+       //and clear them afterwards so that a repeated call cannot destroy / free them twice
+       if (fHelperMemMutex != NULL)
+       {
+               pthread_mutex_destroy((pthread_mutex_t*) fHelperMemMutex);
+               free(fHelperMemMutex);
+               fHelperMemMutex = NULL;
+       }
+
+       if (fSliceGlobalMutexes != NULL)
+       {
+               for (int i = 0;i < fgkNSlices;i++) pthread_mutex_destroy(&((pthread_mutex_t*) fSliceGlobalMutexes)[i]);
+               free(fSliceGlobalMutexes);
+               fSliceGlobalMutexes = NULL;
+       }
+
+       return(ExitGPU_Runtime());
+}
+
+//Per-slice completion step for slice firstSlice + iSlice.
+//Stores the current track / hit counts as the "local" counts, publishes iSlice in fSliceOutputReady and then
+//either drives global tracking plus output writing (fUseGlobalTracking set) or writes the output of the
+//slices with iSlice % (fNHelperThreads + 1) == 0. Always returns 0.
+int AliHLTTPCCAGPUTrackerBase::Reconstruct_Base_FinishSlices(AliHLTTPCCASliceOutput** pOutput, int& iSlice, int& firstSlice)
+{
+       //Remember the counts as they are now in the fNLocal* fields
+       fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTracks = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracks;
+       fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTrackHits = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTrackHits;
+       //NOTE(review): the reason for forcing fNTracklets to 1 is not visible in this function - confirm
+       if (fUseGlobalTracking) fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracklets = 1;
+
+       if (fDebugLevel >= 3) HLTInfo("Data ready for slice %d, helper thread %d", iSlice, iSlice % (fNHelperThreads + 1));
+       //Publish the progress (the member is declared volatile in the class)
+       fSliceOutputReady = iSlice;
+
+       if (fUseGlobalTracking)
+       {
+               //Once the slice with index 2 inside its half (fgkNSlices / 2 slices per half) is ready,
+               //global tracking is run for the slice with index 1 of the same half
+               if (iSlice % (fgkNSlices / 2) == 2)
+               {
+                       int tmpId = iSlice % (fgkNSlices / 2) - 1;
+                       if (iSlice >= fgkNSlices / 2) tmpId += fgkNSlices / 2;
+                       GlobalTracking(tmpId, 0, NULL);
+                       fGlobalTrackingDone[tmpId] = 1;
+               }
+               //Visit the slices tmpSlice3a + 1 for tmpSlice3a = 0, fNHelperThreads + 1, 2 * (fNHelperThreads + 1), ...
+               for (int tmpSlice3a = 0;tmpSlice3a < iSlice;tmpSlice3a += fNHelperThreads + 1)
+               {
+                       int tmpSlice3 = tmpSlice3a + 1;
+                       //If the + 1 stepped onto the first index of a half, go back by one half
+                       if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2);
+                       if (tmpSlice3 >= iSlice) break;
+
+                       //Cyclic left / right neighbours inside the same half
+                       int sliceLeft = (tmpSlice3 + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2);
+                       int sliceRight = (tmpSlice3 + 1) % (fgkNSlices / 2);
+                       if (tmpSlice3 >= fgkNSlices / 2)
+                       {
+                               sliceLeft += fgkNSlices / 2;
+                               sliceRight += fgkNSlices / 2;
+                       }
+
+                       //Index 1 of each half is skipped here (handled by the branch above); the others run once
+                       //both neighbour indices are below iSlice
+                       if (tmpSlice3 % (fgkNSlices / 2) != 1 && fGlobalTrackingDone[tmpSlice3] == 0 && sliceLeft < iSlice && sliceRight < iSlice)
+                       {
+                               GlobalTracking(tmpSlice3, 0, NULL);
+                               fGlobalTrackingDone[tmpSlice3] = 1;
+                       }
+
+                       //Write the output once, as soon as both ready flags of this slice are set
+                       if (fWriteOutputDone[tmpSlice3] == 0 && fSliceLeftGlobalReady[tmpSlice3] && fSliceRightGlobalReady[tmpSlice3])
+                       {
+                               WriteOutput(pOutput, firstSlice, tmpSlice3, 0);
+                               fWriteOutputDone[tmpSlice3] = 1;
+                       }
+               }
+       }
+       else
+       {
+               //Without global tracking only every (fNHelperThreads + 1)-th slice is written here
+               //(presumably the helper threads write the remaining slices - confirm)
+               if (iSlice % (fNHelperThreads + 1) == 0)
+               {
+                       WriteOutput(pOutput, firstSlice, iSlice, 0);
+               }
+       }
+       return(0);
+}
+
+//Final step of a reconstruction run.
+//With global tracking enabled it finishes the outstanding global tracking and output writing for the slices
+//visited by the stride loops below, then locks the second mutex of every helper / CPU tracker parameter set,
+//frees the global tracking scratch buffer and records the final timing. Always returns 0.
+int AliHLTTPCCAGPUTrackerBase::Reconstruct_Base_Finalize(AliHLTTPCCASliceOutput** pOutput, char*& tmpMemoryGlobalTracking, int& firstSlice)
+{
+       if (fUseGlobalTracking)
+       {
+               //First pass: run global tracking for every visited slice that has not been processed yet
+               for (int tmpSlice3a = 0;tmpSlice3a < fgkNSlices;tmpSlice3a += fNHelperThreads + 1)
+               {
+                       int tmpSlice3 = (tmpSlice3a + 1);
+                       //If the + 1 stepped onto the first index of a half, go back by one half
+                       if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2);
+                       if (fGlobalTrackingDone[tmpSlice3] == 0) GlobalTracking(tmpSlice3, 0, NULL);
+               }
+               //Second pass: write the outputs that are still missing
+               for (int tmpSlice3a = 0;tmpSlice3a < fgkNSlices;tmpSlice3a += fNHelperThreads + 1)
+               {
+                       int tmpSlice3 = (tmpSlice3a + 1);
+                       if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2);
+                       if (fWriteOutputDone[tmpSlice3] == 0)
+                       {
+                               //Busy-wait (empty loop body) until both ready flags of this slice are set
+                               while (fSliceLeftGlobalReady[tmpSlice3] == 0 || fSliceRightGlobalReady[tmpSlice3] == 0);
+                               WriteOutput(pOutput, firstSlice, tmpSlice3, 0);
+                       }
+               }
+       }
+
+       //Lock mutex [1] of every helper and CPU tracker parameter set
+       //(presumably this blocks until the corresponding thread has finished its work - confirm)
+       for (int i = 0;i < fNHelperThreads + fNCPUTrackers;i++)
+       {
+               pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]);
+       }
+
+       if (fUseGlobalTracking)
+       {
+               //Buffer handed in by the caller; Reconstruct_Base_StartGlobal allocates such a buffer with malloc
+               free(tmpMemoryGlobalTracking);
+               if (fDebugLevel >= 3)
+               {
+                       for (int iSlice = 0;iSlice < fgkNSlices;iSlice++)
+                       {
+                               HLTDebug("Slice %d - Tracks: Local %d Global %d - Hits: Local %d Global %d\n", iSlice, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTracks, fSlaveTrackers[iSlice].CommonMemory()->fNTracks, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTrackHits, fSlaveTrackers[iSlice].CommonMemory()->fNTrackHits);
+                       }
+               }
+       }
+
+       StandalonePerfTime(firstSlice, 10);
+
+       if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
+       return(0);
+}
+
+int AliHLTTPCCAGPUTrackerBase::Reconstruct_Base_StartGlobal(AliHLTTPCCASliceOutput** pOutput, char*& tmpMemoryGlobalTracking)
+{
+       //With global tracking enabled: allocates one scratch chunk per slice, clears the per-slice
+       //progress flags and assigns the chunks to the slave trackers.
+       //In any case every helper thread is then switched to phase 1 and its first mutex is unlocked.
+       //Always returns 0.
+       if (fUseGlobalTracking)
+       {
+               //Bytes per slice: one tracklet, optionally the row hit indices, plus 16 extra bytes
+               const int perSliceBytes = sizeof(AliHLTTPCCATracklet)
+#ifdef EXTERN_ROW_HITS
+               + HLTCA_ROW_COUNT * sizeof(int)
+#endif
+               + 16;
+               //NOTE(review): the malloc result is used without a NULL check
+               tmpMemoryGlobalTracking = (char*) malloc(perSliceBytes * fgkNSlices);
+
+               //Reset all bookkeeping flags before the memory is handed out
+               for (int iSlice = 0;iSlice < fgkNSlices;iSlice++)
+               {
+                       fSliceLeftGlobalReady[iSlice] = 0;
+                       fSliceRightGlobalReady[iSlice] = 0;
+                       fGlobalTrackingDone[iSlice] = 0;
+                       fWriteOutputDone[iSlice] = 0;
+               }
+
+               //Walk through the buffer chunk by chunk, one chunk for each slave tracker
+               char* sliceChunk = tmpMemoryGlobalTracking;
+               for (int iSlice = 0;iSlice < fgkNSlices;iSlice++, sliceChunk += perSliceBytes)
+               {
+                       fSlaveTrackers[iSlice].SetGPUTrackerTrackletsMemory(sliceChunk, 1, fConstructorBlockCount);
+               }
+       }
+
+       for (int iHelper = 0;iHelper < fNHelperThreads;iHelper++)
+       {
+               helperParam& helper = fHelperParams[iHelper];
+               helper.fPhase = 1;
+               helper.pOutput = pOutput;
+               pthread_mutex_unlock(&((pthread_mutex_t*) helper.fMutex)[0]);
+       }
+       return(0);
+}
+
+//Prepares the slice data of slice firstSlice + iSlice.
+//Slices with iSlice % (fNHelperThreads + 1) == 0 are read in by this thread via ReadEvent, for all other
+//slices the function busy-waits until the responsible helper thread reports fDone >= iSlice.
+//Returns 0 on success and 1 (after ResetHelperThreads(1)) if the slice data exceeds HLTCA_GPU_SLICE_DATA_MEMORY.
+int AliHLTTPCCAGPUTrackerBase::Reconstruct_Base_SliceInit(AliHLTTPCCAClusterData* pClusterData, int& iSlice, int& firstSlice)
+{
+       StandalonePerfTime(firstSlice + iSlice, 0);
+
+       //Initialize GPU Slave Tracker
+       if (fDebugLevel >= 3) HLTInfo("Creating Slice Data (Slice %d)", iSlice);
+       if (iSlice % (fNHelperThreads + 1) == 0)
+       {
+               ReadEvent(pClusterData, firstSlice, iSlice, 0);
+       }
+       else
+       {
+               if (fDebugLevel >= 3) HLTInfo("Waiting for helper thread %d", iSlice % (fNHelperThreads + 1) - 1);
+               //Busy-wait (empty loop body) on the volatile fDone counter of the helper thread
+               while(fHelperParams[iSlice % (fNHelperThreads + 1) - 1].fDone < iSlice);
+       }
+
+       if (fDebugLevel >= 4)
+       {
+#ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
+               //The local slice count is not a parameter of this function; it is read back from fGPUnSlices,
+               //which Reconstruct_Base_Init fills with sliceCountLocal
+               //(assumes Reconstruct_Base_Init ran before this function - confirm against the caller)
+               *fOutFile << std::endl << std::endl << "Reconstruction: " << iSlice << "/" << (int) fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices << " Total Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << " / " << fgkNSlices << std::endl;
+#endif
+               if (fDebugMask & 1) fSlaveTrackers[firstSlice + iSlice].DumpSliceData(*fOutFile);
+       }
+
+       //RANDOM_ERROR is an empty macro unless the error injection variant in the header is enabled
+       if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY RANDOM_ERROR)
+       {
+               HLTError("Insufficiant Slice Data Memory");
+               ResetHelperThreads(1);
+               return(1);
+       }
+
+       if (fDebugLevel >= 3)
+       {
+               HLTInfo("GPU Slice Data Memory Used: %d/%d", (int) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize(), HLTCA_GPU_SLICE_DATA_MEMORY);
+       }
+       return(0);
+}
+
+//Common start of a reconstruction run.
+//  pOutput / pClusterData: output and input arrays, passed on to the helper parameter sets and ReconstructPP
+//  firstSlice:             index of the first slave tracker to use
+//  sliceCountLocal:        in/out, -1 is replaced by fSliceCount; reduced by the slices given to the CPU trackers
+//Validates the tracker state, delegates to ReconstructPP in PP mode, otherwise copies the slave trackers into
+//fGpuTracker, assigns the GPU memory of every slice, resets the per-slice counters and releases the helper threads.
+//Returns 0 on success, 1 on error, or the result of ReconstructPP in PP mode.
+//NOTE(review): the error returns inside the slice loop happen after ActivateThreadContext() without a
+//ReleaseThreadContext() in this function - confirm that the caller releases the context.
+int AliHLTTPCCAGPUTrackerBase::Reconstruct_Base_Init(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int& firstSlice, int& sliceCountLocal)
+{
+       if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;
+
+       if (!fCudaInitialized)
+       {
+               HLTError("GPUTracker not initialized");
+               return(1);
+       }
+       if (sliceCountLocal > fSliceCount)
+       {
+               HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);
+               return(1);
+       }
+       //Only the stored thread id is updated here when the calling thread changed
+       if (fThreadId != GetThread())
+       {
+               HLTWarning("CUDA thread changed, migrating context, Previous Thread: %d, New Thread: %d", fThreadId, GetThread());
+               fThreadId = GetThread();
+       }
+
+       if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);
+
+       //All tracker objects of this run must fit into HLTCA_GPU_TRACKER_CONSTANT_MEM bytes
+       if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
+       {
+               HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));
+               return(1);
+       }
+       
+       ActivateThreadContext();
+       //PP mode is handled completely by ReconstructPP
+       if (fPPMode)
+       {
+               int retVal = ReconstructPP(pOutput, pClusterData, firstSlice, sliceCountLocal);
+               ReleaseThreadContext();
+               return(retVal);
+       }
+
+       //The parameter sets behind the fNHelperThreads helper entries belong to the CPU trackers.
+       //They get the full job description and their first mutex is unlocked
+       //(presumably this starts the CPU tracker thread - confirm)
+       for (int i = fNHelperThreads;i < fNCPUTrackers + fNHelperThreads;i++)
+       {
+               fHelperParams[i].CPUTracker = 1;
+               fHelperParams[i].pClusterData = pClusterData;
+               fHelperParams[i].pOutput = pOutput;
+               fHelperParams[i].fSliceCount = sliceCountLocal;
+               fHelperParams[i].fFirstSlice = firstSlice;
+               pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]);
+       }
+       //The slices taken over by the CPU trackers are removed from the GPU share (never below 0)
+       sliceCountLocal -= fNCPUTrackers * fNSlicesPerCPUTracker;
+       if (sliceCountLocal < 0) sliceCountLocal = 0;
+
+       //Global tracking is only used when it is enabled and all fgkNSlices slices remain for the GPU
+       fUseGlobalTracking = fGlobalTracking && sliceCountLocal == fgkNSlices;
+
+       //Raw byte copy of the host side slave trackers into the fGpuTracker array
+       memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
+
+       if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
+
+#ifdef HLTCA_GPU_TIME_PROFILE
+       AliHLTTPCCATracker::StandaloneQueryFreq(&fProfTimeC);
+       AliHLTTPCCATracker::StandaloneQueryTime(&fProfTimeD);
+#endif
+
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+       {
+               //Make this a GPU Tracker
+               fGpuTracker[iSlice].SetGPUTracker();
+               fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
+               fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
+               fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
+               fGpuTracker[iSlice].GPUParametersConst()->fGPUMem = (char*) fGPUMemory;
+
+               //Set Pointers to GPU Memory
+               char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
+
+               //Each Set...Memory call returns the pointer used for the next assignment, which is then aligned to 1024 * 1024 bytes
+               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
+               tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
+               tmpMem = alignPointer(tmpMem, 1024 * 1024);
+
+               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
+               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount);
+               tmpMem = alignPointer(tmpMem, 1024 * 1024);
+
+               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
+               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
+               tmpMem = alignPointer(tmpMem, 1024 * 1024);
+
+               //RANDOM_ERROR is an empty macro unless the error injection variant in the header is enabled
+               if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY RANDOM_ERROR)
+               {
+                       HLTError("Insufficiant Track Memory");
+                       ResetHelperThreads(0);
+                       return(1);
+               }
+
+               //The memory consumed by this slice must stay inside its HLTCA_GPU_GLOBAL_MEMORY sized region
+               if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY RANDOM_ERROR)
+               {
+                       HLTError("Insufficiant Global Memory");
+                       ResetHelperThreads(0);
+                       return(1);
+               }
+
+               if (fDebugLevel >= 3)
+               {
+                       HLTInfo("GPU Global Memory Used: %d/%d, Page Locked Tracks Memory used: %d / %d", (int) (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice)), HLTCA_GPU_GLOBAL_MEMORY, (int) fGpuTracker[iSlice].TrackMemorySize(), HLTCA_GPU_TRACKS_MEMORY);
+               }
+
+               //Initialize Startup Constants
+               *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
+               *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
+               *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
+               //Distribute fConstructorBlockCount blocks over the slices: with more slices than blocks the first
+               //fConstructorBlockCount slices get one block each, otherwise the blocks are split as evenly as integer division allows
+               fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = sliceCountLocal > fConstructorBlockCount ? (iSlice < fConstructorBlockCount) : fConstructorBlockCount * (iSlice + 1) / sliceCountLocal - fConstructorBlockCount * (iSlice) / sliceCountLocal;
+               if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);
+               fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
+               fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
+               fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
+               fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fNextTracklet = (fConstructorBlockCount / sliceCountLocal + (fConstructorBlockCount % sliceCountLocal > iSlice)) * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;
+               //Every slice uses the slice data memory of slice 0 as texture base
+               fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
+       }
+
+       //Reset the state of the helper threads, hand them the job description and unlock their first mutex
+       for (int i = 0;i < fNHelperThreads;i++)
+       {
+               fHelperParams[i].CPUTracker = 0;
+               fHelperParams[i].fDone = 0;
+               fHelperParams[i].fPhase = 0;
+               fHelperParams[i].pClusterData = pClusterData;
+               fHelperParams[i].fSliceCount = sliceCountLocal;
+               fHelperParams[i].fFirstSlice = firstSlice;
+               pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]);
+       }
+
+       return(0);
+}
diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerBase.h b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerBase.h
new file mode 100644 (file)
index 0000000..5c95b0c
--- /dev/null
@@ -0,0 +1,207 @@
+//-*- Mode: C++ -*-
+// $Id$
+
+// ************************************************************************
+// This file is property of and copyright by the ALICE HLT Project        *
+// ALICE Experiment at CERN, All rights reserved.                         *
+// See cxx source for full Copyright notice                               *
+//                                                                        *
+//*************************************************************************
+
+//  @file   AliHLTTPCCAGPUTrackerBase.h
+//  @author David Rohr, Sergey Gorbunov
+//  @date   
+//  @brief  Common base class of the TPC CA GPU tracker (runtime specific parts are pure virtual)
+//  @note 
+
+#ifndef ALIHLTTPCCAGPUTRACKERBASE_H
+#define ALIHLTTPCCAGPUTRACKERBASE_H
+
+#define HLTCA_GPU_DEFAULT_MAX_SLICE_COUNT 36
+
+#include "AliHLTTPCCAGPUTracker.h"
+#include "AliHLTTPCCADef.h"
+#include "AliHLTTPCCATracker.h"
+#include "AliHLTLogging.h"
+#include "AliHLTTPCCASliceOutput.h"
+
+//Threading headers: CINT only sees a placeholder type, Windows uses the project's pthread wrapper
+#ifdef __CINT__
+typedef int cudaError_t;
+#elif defined(R__WIN32)
+#include "../cmodules/pthread_mutex_win32_wrapper.h"
+#else
+#include <pthread.h>
+#include <errno.h>
+#endif
+
+#define RANDOM_ERROR
+//#define RANDOM_ERROR || rand() % 500 == 1
+
+MEM_CLASS_PRE() class AliHLTTPCCARow;
+
+//Base class of the GPU tracker implementations.
+//It implements the parts declared non-pure below (initialization frame, helper thread handling, memory layout
+//helpers, the Reconstruct_Base_* steps) and leaves the runtime specific parts (InitGPU_Runtime, ExitGPU_Runtime,
+//Reconstruct, ReconstructPP, RefitMergedTracks, context handling, GPUSync) to the derived class.
+class AliHLTTPCCAGPUTrackerBase : public AliHLTTPCCAGPUTracker, public AliHLTLogging
+{
+       friend void* helperWrapper(void*);
+public:
+       AliHLTTPCCAGPUTrackerBase();
+       virtual ~AliHLTTPCCAGPUTrackerBase();
+
+       //Initialization / shutdown: InitGPU and ExitGPU contain the common part and call the *_Runtime hooks
+       virtual int InitGPU(int sliceCount = -1, int forceDeviceID = -1);
+       virtual int InitGPU_Runtime(int sliceCount = -1, int forceDeviceID = -1) = 0;
+       virtual int IsInitialized();
+       virtual int Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1) = 0;
+       int SelfHealReconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1);
+       virtual int ExitGPU();
+       virtual int ExitGPU_Runtime() = 0;
+
+       virtual void SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile = NULL);
+       virtual int SetGPUTrackerOption(char* OptionName, int OptionValue);
+
+       virtual unsigned long long int* PerfTimer(int iSlice, unsigned int i);
+
+       virtual int InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param);
+       virtual void SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val);
+
+       virtual const AliHLTTPCCASliceOutput::outputControlStruct* OutputControl() const;
+       virtual int GetSliceCount() const;
+
+       virtual int RefitMergedTracks(AliHLTTPCGMMerger* Merger) = 0;
+       virtual char* MergerBaseMemory();
+
+protected:
+       virtual void ActivateThreadContext() = 0;
+       virtual void ReleaseThreadContext() = 0;
+       virtual void SynchronizeGPU() = 0;
+
+       //Job description / state of one helper thread or CPU tracker thread
+       struct helperParam
+       {
+               void* fThreadId;
+               AliHLTTPCCAGPUTrackerBase* fCls;
+               int fNum;
+               int fSliceCount;        //Number of slices of the current run
+               AliHLTTPCCAClusterData* pClusterData;
+               AliHLTTPCCASliceOutput** pOutput;
+               int fFirstSlice;        //First slice of the current run
+               void* fMutex;           //Used as array of pthread_mutex_t, entries [0] and [1] are accessed
+               bool fTerminate;
+               int fPhase;             //Set to 0 by Reconstruct_Base_Init and to 1 by Reconstruct_Base_StartGlobal
+               int CPUTracker;         //1 for CPU tracker entries, 0 for helper thread entries
+               volatile int fDone;     //Progress counter the main thread busy-waits on
+               volatile bool fReset;
+       };
+
+       //Offsets of the individual regions inside one base buffer:
+       //row data first, then HLTCA_GPU_COMMON_MEMORY bytes of common memory, then the slice data regions,
+       //followed by the per-slice global / tracks regions and the tracker objects.
+       //GlobalMemory and TracksMemory start at the same offset and differ only in the per-slice stride
+       //(presumably they are applied to different base buffers - confirm)
+       static void* RowMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + iSlice * sizeof(AliHLTTPCCARow) * (HLTCA_ROW_COUNT + 1) ); }
+       static void* CommonMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + iSlice * AliHLTTPCCATracker::CommonMemorySize() ); }
+       static void* SliceDataMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + iSlice * HLTCA_GPU_SLICE_DATA_MEMORY ); }
+       void* GlobalMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY) + iSlice * HLTCA_GPU_GLOBAL_MEMORY ); }
+       void* TracksMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY) + iSlice * HLTCA_GPU_TRACKS_MEMORY ); }
+       void* TrackerMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + iSlice * sizeof(AliHLTTPCCATracker) ); }
+
+       //Runtime independent steps of a reconstruction run
+       int Reconstruct_Base_Init(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int& firstSlice, int& sliceCountLocal);
+       int Reconstruct_Base_SliceInit(AliHLTTPCCAClusterData* pClusterData, int& iSlice, int& firstSlice);
+       int Reconstruct_Base_StartGlobal(AliHLTTPCCASliceOutput** pOutput, char*& tmpMemoryGlobalTracking);
+       int Reconstruct_Base_FinishSlices(AliHLTTPCCASliceOutput** pOutput, int& iSlice, int& firstSlice);
+       int Reconstruct_Base_Finalize(AliHLTTPCCASliceOutput** pOutput, char*& tmpMemoryGlobalTracking, int& firstSlice);
+       virtual int ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1) = 0;
+       
+       void ReadEvent(AliHLTTPCCAClusterData* pClusterData, int firstSlice, int iSlice, int threadId);
+       void WriteOutput(AliHLTTPCCASliceOutput** pOutput, int firstSlice, int iSlice, int threadId);
+       int GlobalTracking(int iSlice, int threadId, helperParam* hParam);
+
+       int StartHelperThreads();
+       int StopHelperThreads();
+       void ResetHelperThreads(int helpers);
+       void ResetThisHelperThread(AliHLTTPCCAGPUTrackerBase::helperParam* par);
+
+       int GetThread();
+       void ReleaseGlobalLock(void* sem);
+       int CheckMemorySizes(int sliceCount);
+
+       virtual int GPUSync(char* state = "UNKNOWN", int stream = -1, int slice = 0) = 0;
+       //Rounds ptr up to the next multiple of alignment bytes (defined below the class)
+       template <class T> T* alignPointer(T* ptr, int alignment);
+       void StandalonePerfTime(int iSlice, int i);
+#define GPUFailedMsg(x) GPUFailedMsgA(x, __FILE__, __LINE__)
+       
+       static void* helperWrapper(void*);
+
+       AliHLTTPCCATracker *fGpuTracker; //Tracker Objects that will be used on the GPU
+       void* fGPUMemory; //Pointer to GPU Memory Base Adress
+       void* fHostLockedMemory; //Pointer to Base Adress of Page Locked Host Memory for DMA Transfer
+
+       void* fGPUMergerMemory;
+       void* fGPUMergerHostMemory; //MergerBaseMemory() returns this pointer aligned to 1024 * 1024 bytes
+       int fGPUMergerMaxMemory; //Added to fGPUMemSize by InitGPU when HLTCA_GPU_MERGER is defined
+
+       int fDebugLevel;                        //Debug Level for GPU Tracker
+       unsigned int fDebugMask;        //Mask which Debug Data is written to file
+       std::ostream* fOutFile;         //Debug Output Stream Pointer
+       unsigned long long int fGPUMemSize;     //Memory Size to allocate on GPU
+
+       int fSliceCount; //Maximum Number of Slices this GPU tracker can process in parallel
+       int fCudaDevice; //CUDA device used by GPU tracker
+
+       static const int fgkNSlices = 36; //Number of Slices in Alice
+       AliHLTTPCCATracker fSlaveTrackers[fgkNSlices]; //CPU Slave Trackers for Initialization and Output
+
+       AliHLTTPCCASliceOutput::outputControlStruct* fOutputControl; //Output Control Structure
+       
+       int fThreadId; //Thread ID that is valid for the local CUDA context
+       int fCudaInitialized; //Flag if CUDA is initialized
+
+       int fPPMode; //Flag if GPU tracker runs in PP Mode
+       int fSelfheal; //Reinitialize GPU on failure
+
+       int fConstructorBlockCount; //GPU blocks used in Tracklet Constructor
+       int selectorBlockCount; //GPU blocks used in Tracklet Selector
+       
+#ifdef HLTCA_GPU_TIME_PROFILE
+       unsigned long long int fProfTimeC, fProfTimeD; //Timing
+#endif
+
+       int fNHelperThreads; //Number of helper threads for post/preprocessing
+       helperParam* fHelperParams; //Control Struct for helper threads
+       void* fHelperMemMutex; //Storage of one pthread_mutex_t, allocated in InitGPU and released in ExitGPU
+       
+       //When __ROOT__ is defined the volatile qualifier is removed from the following declarations
+       //(presumably for the ROOT dictionary parser - confirm)
+#ifdef __ROOT__
+#define volatile
+#endif
+       volatile int fSliceOutputReady; //Set to iSlice by Reconstruct_Base_FinishSlices
+       volatile char fSliceLeftGlobalReady[fgkNSlices]; //Per-slice flags, cleared in Reconstruct_Base_StartGlobal
+       volatile char fSliceRightGlobalReady[fgkNSlices]; //Per-slice flags, cleared in Reconstruct_Base_StartGlobal
+#ifdef __ROOT__
+#undef volatile
+#endif
+       void* fSliceGlobalMutexes; //Storage of fgkNSlices pthread_mutex_t, allocated in InitGPU and released in ExitGPU
+       char fGlobalTrackingDone[fgkNSlices]; //Set to 1 once GlobalTracking ran for the slice
+       char fWriteOutputDone[fgkNSlices]; //Set to 1 once WriteOutput ran for the slice
+
+       int fNCPUTrackers; //Number of CPU trackers to use
+       int fNSlicesPerCPUTracker; //Number of slices processed by each CPU tracker
+
+       int fGlobalTracking; //Use Global Tracking
+       int fUseGlobalTracking; //Set per run: fGlobalTracking and all fgkNSlices slices are processed
+
+       int fNSlaveThreads;     //Number of slave threads currently active
+
+       // disable copy
+       AliHLTTPCCAGPUTrackerBase( const AliHLTTPCCAGPUTrackerBase& );
+       AliHLTTPCCAGPUTrackerBase &operator=( const AliHLTTPCCAGPUTrackerBase& );
+
+       ClassDef( AliHLTTPCCAGPUTrackerBase, 0 )
+};
+
+template <class T> inline T* AliHLTTPCCAGPUTrackerBase::alignPointer(T* ptr, int alignment)
+{
+       //Rounds ptr up to the next address that is a multiple of alignment bytes.
+       //A pointer that already sits on such an address is returned unchanged.
+       //The tracker calls this with 1024 * 1024, i.e. 1 MB boundaries.
+
+       const size_t address = (size_t) ptr;
+       const size_t boundary = (size_t) alignment;
+       const size_t overshoot = address % boundary;
+       if (overshoot == 0) return(ptr);
+       return((T*) (address + (boundary - overshoot)));
+}
+
+#endif
diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerCommon.h b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerCommon.h
new file mode 100644 (file)
index 0000000..3467831
--- /dev/null
@@ -0,0 +1,28 @@
+//Disable assertions since they produce errors in GPU Code
+#ifdef assert
+#undef assert
+#endif
+#define assert(param)
+
+#ifdef R__WIN32
+#else
+#include <sys/syscall.h>
+#include <semaphore.h>
+#include <fcntl.h>
+#endif
+#include "AliHLTTPCCADef.h"
+#include "AliHLTTPCCAGPUConfig.h"
+
+#if defined(HLTCA_STANDALONE) & !defined(_WIN32)
+#include <sched.h>
+#endif
+
+#include <iostream>
+#include <fstream>
+
+#include "MemoryAssignmentHelpers.h"
+
+#ifndef HLTCA_STANDALONE
+#include "AliHLTDefinitions.h"
+#include "AliHLTSystem.h"
+#endif
index 7b2874c..0ce1be6 100755 (executable)
 //                                                                          *
 //***************************************************************************
 
-#define HLTCA_GPU_DEFAULT_MAX_SLICE_COUNT 36
 #define FERMI
 #include "AliHLTTPCCAGPUTrackerNVCC.h"
+#include "AliHLTTPCCAGPUTrackerCommon.h"
+#define get_global_id(dim) (blockIdx.x * blockDim.x + threadIdx.x)
+#define get_global_size(dim) (blockDim.x * gridDim.x)
+#define get_num_groups(dim) (gridDim.x)
+#define get_local_id(dim) (threadIdx.x)
+#define get_local_size(dim) (blockDim.x)
+#define get_group_id(dim) (blockIdx.x)
 
-#ifdef HLTCA_GPUCODE
 #include <cuda.h>
 #include <sm_11_atomic_functions.h>
 #include <sm_12_atomic_functions.h>
-#endif
-
-#ifdef R__WIN32
-#else
-#include <sys/syscall.h>
-#include <semaphore.h>
-#include <fcntl.h>
-#endif
-#include "AliHLTTPCCADef.h"
-#include "AliHLTTPCCAGPUConfig.h"
-
-#if defined(HLTCA_STANDALONE) & !defined(_WIN32)
-#include <sched.h>
-#endif
-
-#include <iostream>
-#include <fstream>
-
-//Disable assertions since they produce errors in GPU Code
-#ifdef assert
-#undef assert
-#endif
-#define assert(param)
 
 __constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];
 #ifdef HLTCA_GPU_TEXTURE_FETCH
@@ -80,213 +62,9 @@ texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
 #include "AliHLTTPCGMTrackParam.cxx"
 #endif
 
-#include "MemoryAssignmentHelpers.h"
-
-#ifndef HLTCA_STANDALONE
-#include "AliHLTDefinitions.h"
-#include "AliHLTSystem.h"
-#endif
-
-#define RANDOM_ERROR
-//#define RANDOM_ERROR || rand() % 500 == 1
-
 ClassImp( AliHLTTPCCAGPUTrackerNVCC )
 
-int AliHLTTPCCAGPUTrackerNVCC::GlobalTracking(int iSlice, int threadId, AliHLTTPCCAGPUTrackerNVCC::helperParam* hParam)
-{
-       if (fDebugLevel >= 3) printf("GPU Tracker running Global Tracking for slice %d on thread %d\n", iSlice, threadId);
-
-       int sliceLeft = (iSlice + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2);
-       int sliceRight = (iSlice + 1) % (fgkNSlices / 2);
-       if (iSlice >= fgkNSlices / 2)
-       {
-               sliceLeft += fgkNSlices / 2;
-               sliceRight += fgkNSlices / 2;
-       }
-       while (fSliceOutputReady < iSlice || fSliceOutputReady < sliceLeft || fSliceOutputReady < sliceRight)
-       {
-               if (hParam != NULL && hParam->fReset) return(1);
-       }
-
-       pthread_mutex_lock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceLeft]);
-       pthread_mutex_lock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceRight]);
-       fSlaveTrackers[iSlice].PerformGlobalTracking(fSlaveTrackers[sliceLeft], fSlaveTrackers[sliceRight], HLTCA_GPU_MAX_TRACKS);
-       pthread_mutex_unlock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceLeft]);
-       pthread_mutex_unlock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceRight]);
-
-       fSliceLeftGlobalReady[sliceLeft] = 1;
-       fSliceRightGlobalReady[sliceRight] = 1;
-       if (fDebugLevel >= 3) printf("GPU Tracker finished Global Tracking for slice %d on thread %d\n", iSlice, threadId);
-       return(0);
-}
-
-void* AliHLTTPCCAGPUTrackerNVCC::helperWrapper(void* arg)
-{
-       AliHLTTPCCAGPUTrackerNVCC::helperParam* par = (AliHLTTPCCAGPUTrackerNVCC::helperParam*) arg;
-       AliHLTTPCCAGPUTrackerNVCC* cls = par->fCls;
-
-       AliHLTTPCCATracker* tmpTracker = new AliHLTTPCCATracker;
-
-#ifdef HLTCA_STANDALONE
-       if (cls->fDebugLevel >= 2) HLTInfo("\tHelper thread %d starting", par->fNum);
-#endif
-
-#if defined(HLTCA_STANDALONE) & !defined(_WIN32)
-       cpu_set_t mask;
-       CPU_ZERO(&mask);
-       CPU_SET(par->fNum * 2 + 2, &mask);
-       //sched_setaffinity(0, sizeof(mask), &mask);
-#endif
-
-       while(pthread_mutex_lock(&((pthread_mutex_t*) par->fMutex)[0]) == 0 && par->fTerminate == false)
-       {
-               if (par->CPUTracker)
-               {
-                       for (int i = 0;i < cls->fNSlicesPerCPUTracker;i++)
-                       {
-                               int myISlice = cls->fSliceCount - cls->fNCPUTrackers * cls->fNSlicesPerCPUTracker + (par->fNum - cls->fNHelperThreads) * cls->fNSlicesPerCPUTracker + i;
-#ifdef HLTCA_STANDALONE
-                               if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Doing full CPU tracking, Slice %d", par->fNum, myISlice);
-#endif
-                               if (myISlice >= 0)
-                               {
-                                       tmpTracker->Initialize(cls->fSlaveTrackers[par->fFirstSlice + myISlice].Param());
-                                       tmpTracker->ReadEvent(&par->pClusterData[myISlice]);
-                                       tmpTracker->DoTracking();
-                                       tmpTracker->SetOutput(&par->pOutput[myISlice]);
-                                       pthread_mutex_lock((pthread_mutex_t*) cls->fHelperMemMutex);
-                                       tmpTracker->WriteOutputPrepare();
-                                       pthread_mutex_unlock((pthread_mutex_t*) cls->fHelperMemMutex);
-                                       tmpTracker->WriteOutput();
-
-                                       /*cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUSliceDataMemory((char*) new uint4[HLTCA_GPU_SLICE_DATA_MEMORY/sizeof(uint4)], (char*) new uint4[HLTCA_GPU_ROWS_MEMORY/sizeof(uint4)]);
-                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].ReadEvent(&par->pClusterData[myISlice]);
-                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersTracklets(HLTCA_GPU_MAX_TRACKLETS);
-                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersHits(par->pClusterData[myISlice].NumberOfClusters());
-                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersTracks(HLTCA_GPU_MAX_TRACKS, par->pClusterData[myISlice].NumberOfClusters());
-                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS, cls->fConstructorBlockCount);
-                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].HitMemorySize()/sizeof( uint4 ) + 100]), par->pClusterData[myISlice].NumberOfClusters());
-                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerTracksMemory(reinterpret_cast<char*> ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackMemorySize()/sizeof( uint4 ) + 100]), HLTCA_GPU_MAX_TRACKS, par->pClusterData[myISlice].NumberOfClusters());
-                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].DoTracking();
-                                       cls->WriteOutput(par->pOutput, par->fFirstSlice, myISlice, par->fNum + 1);
-                                       delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].HitMemory();
-                                       delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackletMemory();
-                                       delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackMemory();*/
-                               }
-#ifdef HLTCA_STANDALONE
-                               if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Finished, Slice %d", par->fNum, myISlice);
-#endif
-                       }
-               }
-               else
-               {
-                       int mustRunSlice19 = 0;
-                       for (int i = par->fNum + 1;i < par->fSliceCount;i += cls->fNHelperThreads + 1)
-                       {
-                               //if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Running, Slice %d+%d, Phase %d", par->fNum, par->fFirstSlice, i, par->fPhase);
-                               if (par->fPhase)
-                               {
-                                       if (cls->fUseGlobalTracking)
-                                       {
-                                               int realSlice = i + 1;
-                                               if (realSlice % (fgkNSlices / 2) < 1) realSlice -= fgkNSlices / 2;
-
-                                               if (realSlice % (fgkNSlices / 2) != 1)
-                                               {
-                                                       cls->GlobalTracking(realSlice, par->fNum + 1, par);
-                                               }
-
-                                               if (realSlice == 19)
-                                               {
-                                                       mustRunSlice19 = 1;
-                                               }
-                                               else
-                                               {
-                                                       while (cls->fSliceLeftGlobalReady[realSlice] == 0 || cls->fSliceRightGlobalReady[realSlice] == 0)
-                                                       {
-                                                               if (par->fReset) goto ResetHelperThread;
-                                                       }
-                                                       cls->WriteOutput(par->pOutput, par->fFirstSlice, realSlice, par->fNum + 1);
-                                               }
-                                       }
-                                       else
-                                       {
-                                               while (cls->fSliceOutputReady < i)
-                                               {
-                                                       if (par->fReset) goto ResetHelperThread;
-                                               }
-                                               cls->WriteOutput(par->pOutput, par->fFirstSlice, i, par->fNum + 1);
-                                       }
-                               }
-                               else
-                               {
-                                       cls->ReadEvent(par->pClusterData, par->fFirstSlice, i, par->fNum + 1);
-                                       par->fDone = i + 1;
-                               }
-                               //if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Finished, Slice %d+%d, Phase %d", par->fNum, par->fFirstSlice, i, par->fPhase);
-                       }
-                       if (mustRunSlice19)
-                       {
-                               while (cls->fSliceLeftGlobalReady[19] == 0 || cls->fSliceRightGlobalReady[19] == 0)
-                               {
-                                       if (par->fReset) goto ResetHelperThread;
-                               }
-                               cls->WriteOutput(par->pOutput, par->fFirstSlice, 19, par->fNum + 1);
-                       }
-               }
-ResetHelperThread:
-               cls->ResetThisHelperThread(par);
-       }
-#ifdef HLTCA_STANDALONE
-       if (cls->fDebugLevel >= 2) HLTInfo("\tHelper thread %d terminating", par->fNum);
-#endif
-       delete tmpTracker;
-       pthread_mutex_unlock(&((pthread_mutex_t*) par->fMutex)[1]);
-       pthread_exit(NULL);
-       return(NULL);
-}
-
-void AliHLTTPCCAGPUTrackerNVCC::ResetThisHelperThread(AliHLTTPCCAGPUTrackerNVCC::helperParam* par)
-{
-       if (par->fReset) HLTImportant("GPU Helper Thread %d reseting", par->fNum);
-       par->fReset = false;
-       pthread_mutex_unlock(&((pthread_mutex_t*) par->fMutex)[1]);
-}
-
-#define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem"
-
-AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() :
-fGpuTracker(NULL),
-fGPUMemory(NULL),
-fHostLockedMemory(NULL),
-fGPUMergerMemory(NULL),
-fGPUMergerHostMemory(NULL),
-fGPUMergerMaxMemory(0),
-fDebugLevel(0),
-fDebugMask(0xFFFFFFFF),
-fOutFile(NULL),
-fGPUMemSize(0),
-fpCudaStreams(NULL),
-fSliceCount(HLTCA_GPU_DEFAULT_MAX_SLICE_COUNT),
-fCudaDevice(0),
-fOutputControl(NULL),
-fThreadId(0),
-fCudaInitialized(0),
-fPPMode(0),
-fSelfheal(0),
-fConstructorBlockCount(30),
-selectorBlockCount(30),
-fCudaContext(NULL),
-fNHelperThreads(HLTCA_GPU_DEFAULT_HELPER_THREADS),
-fHelperParams(NULL),
-fHelperMemMutex(NULL),
-fSliceOutputReady(0),
-fSliceGlobalMutexes(NULL),
-fNCPUTrackers(0),
-fNSlicesPerCPUTracker(0),
-fGlobalTracking(0),
-fUseGlobalTracking(0),
-fNSlaveThreads(0)
+AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() : fpCudaStreams(NULL)
 {
        fCudaContext = (void*) new CUcontext;
 };
@@ -296,113 +74,22 @@ AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC()
        delete (CUcontext*) fCudaContext;
 };
 
-void AliHLTTPCCAGPUTrackerNVCC::ReleaseGlobalLock(void* sem)
-{
-       //Release the global named semaphore that locks GPU Initialization
-#ifdef R__WIN32
-       HANDLE* h = (HANDLE*) sem;
-       ReleaseSemaphore(*h, 1, NULL);
-       CloseHandle(*h);
-       delete h;
-#else
-       sem_t* pSem = (sem_t*) sem;
-       sem_post(pSem);
-       sem_unlink(SemLockName);
-#endif
-}
-
-int AliHLTTPCCAGPUTrackerNVCC::CheckMemorySizes(int sliceCount)
-{
-       //Check constants for correct memory sizes
-       if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
-       {
-               HLTError("Insufficiant Tracker Object Memory for %d slices", sliceCount);
-               return(1);
-       }
-
-       if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
-       {
-               HLTError("Insufficiant Common Memory");
-               return(1);
-       }
-
-       if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
-       {
-               HLTError("Insufficiant Row Memory");
-               return(1);
-       }
-
-       if (fDebugLevel >= 3)
-       {
-               HLTInfo("Memory usage: Tracker Object %d / %d, Common Memory %d / %d, Row Memory %d / %d", (int) sizeof(AliHLTTPCCATracker) * sliceCount, HLTCA_GPU_TRACKER_OBJECT_MEMORY, (int) (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize()), HLTCA_GPU_COMMON_MEMORY, (int) (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow)), HLTCA_GPU_ROWS_MEMORY);
-       }
-       return(0);
-}
-
-int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)
+int AliHLTTPCCAGPUTrackerNVCC::InitGPU_Runtime(int sliceCount, int forceDeviceID)
 {
        //Find best CUDA device, initialize and allocate memory
 
-#if defined(HLTCA_STANDALONE) & !defined(_WIN32)
-       cpu_set_t mask;
-       CPU_ZERO(&mask);
-       CPU_SET(0, &mask);
-       //sched_setaffinity(0, sizeof(mask), &mask);
-#endif
-
-       if (sliceCount == -1) sliceCount = fSliceCount;
-
-       if (CheckMemorySizes(sliceCount)) return(1);
-
-#ifdef R__WIN32
-       HANDLE* semLock = new HANDLE;
-       *semLock = CreateSemaphore(NULL, 1, 1, SemLockName);
-       if (*semLock == NULL)
-       {
-               HLTError("Error creating GPUInit Semaphore");
-               return(1);
-       }
-       WaitForSingleObject(*semLock, INFINITE);
-#else
-       sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1);
-       if (semLock == SEM_FAILED)
-       {
-               HLTError("Error creating GPUInit Semaphore");
-               return(1);
-       }
-       timespec semtime;
-       clock_gettime(CLOCK_REALTIME, &semtime);
-       semtime.tv_sec += 10;
-       while (sem_timedwait(semLock, &semtime) != 0)
-       {
-               HLTError("Global Lock for GPU initialisation was not released for 10 seconds, assuming another thread died");
-               HLTWarning("Resetting the global lock");
-               sem_post(semLock);
-       }
-#endif
-
-       fThreadId = GetThread();
-
        cudaDeviceProp fCudaDeviceProp;
 
-       fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
-
-#ifdef HLTCA_GPU_MERGER
-       fGPUMergerMaxMemory = 2000000 * 5 * sizeof(float);
-       fGPUMemSize += fGPUMergerMaxMemory;
-#endif
-
 #ifndef CUDA_DEVICE_EMULATION
        int count, bestDevice = -1;
        long long int bestDeviceSpeed = 0, deviceSpeed;
-       if (CudaFailedMsg(cudaGetDeviceCount(&count)))
+       if (GPUFailedMsg(cudaGetDeviceCount(&count)))
        {
                HLTError("Error getting CUDA Device Count");
-               ReleaseGlobalLock(semLock);
                return(1);
        }
        if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");
-#ifdef FERMI
+#if defined(FERMI) || defined(KEPLER)
        const int reqVerMaj = 2;
        const int reqVerMin = 0;
 #else
@@ -425,7 +112,7 @@ int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)
                if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";
                cuCtxDestroy(tmpContext);
                if (fDebugLevel >= 4) printf("Obtained current memory usage for device %d\n", i);
-               if (CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i))) continue;
+               if (GPUFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i))) continue;
                if (fDebugLevel >= 4) printf("Obtained device properties for device %d\n", i);
                int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < reqVerMaj || (fCudaDeviceProp.major == reqVerMaj && fCudaDeviceProp.minor < reqVerMin)) && free >= fGPUMemSize + 100 * 1024 + 1024;
 #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
@@ -444,7 +131,6 @@ int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)
        {
                HLTWarning("No %sCUDA Device available, aborting CUDA Initialisation", count ? "appropriate " : "");
                HLTInfo("Requiring Revision %d.%d, Mem: %lld, Multiprocessors: %d", reqVerMaj, reqVerMin, fGPUMemSize + 100 * 1024 * 1024, sliceCount);
-               ReleaseGlobalLock(semLock);
                return(1);
        }
 
@@ -483,32 +169,28 @@ int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)
        if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
        {
                HLTError( "Unsupported CUDA Device" );
-               ReleaseGlobalLock(semLock);
                return(1);
        }
 
        if (cuCtxCreate((CUcontext*) fCudaContext, CU_CTX_SCHED_AUTO, fCudaDevice) != CUDA_SUCCESS)
        {
                HLTError("Could not set CUDA Device!");
-               ReleaseGlobalLock(semLock);
                return(1);
        }
 
-       if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
+       if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || GPUFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
        {
                HLTError("CUDA Memory Allocation Error");
                cudaThreadExit();
-               ReleaseGlobalLock(semLock);
                return(1);
        }
        fGPUMergerMemory = ((char*) fGPUMemory) + fGPUMemSize - fGPUMergerMaxMemory;
-       ReleaseGlobalLock(semLock);
        if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
        int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
 #ifdef HLTCA_GPU_MERGER
        hostMemSize += fGPUMergerMaxMemory;
 #endif
-       if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
+       if (GPUFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
        {
                cudaFree(fGPUMemory);
                cudaThreadExit();
@@ -520,25 +202,14 @@ int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)
 
        if (fDebugLevel >= 1)
        {
-               CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
-       }
-
-       fSliceCount = sliceCount;
-       //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
-       fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);
-
-       for (int i = 0;i < fgkNSlices;i++)
-       {
-               fSlaveTrackers[i].SetGPUTracker();
-               fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
-               fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
+               GPUFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
        }
 
        fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
        cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
        for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
        {
-               if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
+               if (GPUFailedMsg(cudaStreamCreate(&cudaStreams[i])))
                {
                        cudaFree(fGPUMemory);
                        cudaFreeHost(fHostLockedMemory);
@@ -548,175 +219,13 @@ int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)
                }
        }
 
-       if (StartHelperThreads()) return(1);
-
-       fHelperMemMutex = malloc(sizeof(pthread_mutex_t));
-       if (fHelperMemMutex == NULL)
-       {
-               HLTError("Memory allocation error");
-               cudaFree(fGPUMemory);
-               cudaFreeHost(fHostLockedMemory);
-               cudaThreadExit();
-               return(1);
-       }
-
-       if (pthread_mutex_init((pthread_mutex_t*) fHelperMemMutex, NULL))
-       {
-               HLTError("Error creating pthread mutex");
-               cudaFree(fGPUMemory);
-               cudaFreeHost(fHostLockedMemory);
-               cudaThreadExit();
-               return(1);
-       }
-
-       fSliceGlobalMutexes = malloc(sizeof(pthread_mutex_t) * fgkNSlices);
-       if (fSliceGlobalMutexes == NULL)
-       {
-               HLTError("Memory allocation error");
-               cudaFree(fGPUMemory);
-               cudaFreeHost(fHostLockedMemory);
-               cudaThreadExit();
-               return(1);
-       }
-       for (int i = 0;i < fgkNSlices;i++)
-       {
-               if (pthread_mutex_init(&((pthread_mutex_t*) fSliceGlobalMutexes)[i], NULL))
-               {
-                       HLTError("Error creating pthread mutex");
-                       cudaFree(fGPUMemory);
-                       cudaFreeHost(fHostLockedMemory);
-                       cudaThreadExit();
-                       return(1);
-               }
-       }
-
        cuCtxPopCurrent((CUcontext*) fCudaContext);
-       fCudaInitialized = 1;
        HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %d, Max slices: %d)", fCudaDevice, fCudaDeviceProp.name, fThreadId, fSliceCount);
 
-#if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)
-       if (fDebugLevel < 2 && 0)
-       {
-               //Do one initial run for Benchmark reasons
-               const int useDebugLevel = fDebugLevel;
-               fDebugLevel = 0;
-               AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];
-
-               std::ifstream fin;
-
-               AliHLTTPCCAParam tmpParam;
-               AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;
-
-               fin.open("events/settings.dump");
-               int tmpCount;
-               fin >> tmpCount;
-               for (int i = 0;i < sliceCount;i++)
-               {
-                       fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);
-                       tmpParam.ReadSettings(fin);
-                       InitializeSliceParam(i, tmpParam);
-               }
-               fin.close();
-
-               fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);
-               for (int i = 0;i < sliceCount;i++)
-               {
-                       tmpCluster[i].StartReading(i, 0);
-                       tmpCluster[i].ReadEvent(fin);
-               }
-               fin.close();
-
-               AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];
-               memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));
-
-               Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);
-               for (int i = 0;i < sliceCount;i++)
-               {
-                       free(tmpOutput[i]);
-                       tmpOutput[i] = NULL;
-                       fSlaveTrackers[i].SetOutputControl(NULL);
-               }
-               delete[] tmpOutput;
-               delete[] tmpCluster;
-               fDebugLevel = useDebugLevel;
-       }
-#endif
-
        return(0);
 }
 
-int AliHLTTPCCAGPUTrackerNVCC::StartHelperThreads()
-{
-       int nThreads = fNHelperThreads + fNCPUTrackers;
-       if (nThreads)
-       {
-               fHelperParams = new helperParam[nThreads];
-               if (fHelperParams == NULL)
-               {
-                       HLTError("Memory allocation error");
-                       cudaFree(fGPUMemory);
-                       cudaFreeHost(fHostLockedMemory);
-                       cudaThreadExit();
-                       return(1);
-               }       
-               for (int i = 0;i < nThreads;i++)
-               {
-                       fHelperParams[i].fCls = this;
-                       fHelperParams[i].fTerminate = false;
-                       fHelperParams[i].fReset = false;
-                       fHelperParams[i].fNum = i;
-                       fHelperParams[i].fMutex = malloc(2 * sizeof(pthread_mutex_t));
-                       if (fHelperParams[i].fMutex == NULL)
-                       {
-                               HLTError("Memory allocation error");
-                               cudaFree(fGPUMemory);
-                               cudaFreeHost(fHostLockedMemory);
-                               cudaThreadExit();
-                               return(1);
-                       }
-                       for (int j = 0;j < 2;j++)
-                       {
-                               if (pthread_mutex_init(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j], NULL))
-                               {
-                                       HLTError("Error creating pthread mutex");
-                                       cudaFree(fGPUMemory);
-                                       cudaFreeHost(fHostLockedMemory);
-                                       cudaThreadExit();
-                                       return(1);
-                               }
-
-                               pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]);
-                       }
-                       fHelperParams[i].fThreadId = (void*) malloc(sizeof(pthread_t));
-
-                       if (pthread_create((pthread_t*) fHelperParams[i].fThreadId, NULL, helperWrapper, &fHelperParams[i]))
-                       {
-                               HLTError("Error starting slave thread");
-                               cudaFree(fGPUMemory);
-                               cudaFreeHost(fHostLockedMemory);
-                               cudaThreadExit();
-                       }
-               }
-       }
-       fNSlaveThreads = nThreads;
-       return(0);
-}
-
-template <class T> inline T* AliHLTTPCCAGPUTrackerNVCC::alignPointer(T* ptr, int alignment)
-{
-       //Macro to align Pointers.
-       //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker
-       //(As long as every single data structure is <= 1 MB)
-
-       size_t adr = (size_t) ptr;
-       if (adr % alignment)
-       {
-               adr += alignment - (adr % alignment);
-       }
-       return((T*) adr);
-}
-
-bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsgA(cudaError_t error, const char* file, int line)
+bool AliHLTTPCCAGPUTrackerNVCC::GPUFailedMsgA(cudaError_t error, const char* file, int line)
 {
        //Check for CUDA Error and in the case of an error display the corresponding error string
        if (error == cudaSuccess) return(false);
@@ -724,7 +233,7 @@ bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsgA(cudaError_t error, const char* fi
        return(true);
 }
 
-int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state, int sliceLocal, int slice)
+int AliHLTTPCCAGPUTrackerNVCC::GPUSync(char* state, int stream, int slice)
 {
        //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards
 
@@ -733,81 +242,18 @@ int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state, int sliceLocal, int slice)
        cuErr = cudaGetLastError();
        if (cuErr != cudaSuccess)
        {
-               HLTError("Cuda Error %s while running kernel (%s) (Slice %d; %d/%d)", cudaGetErrorString(cuErr), state, sliceLocal, slice, fgkNSlices);
+               HLTError("Cuda Error %s while running kernel (%s) (Stream %d; %d/%d)", cudaGetErrorString(cuErr), state, stream, slice, fgkNSlices);
                return(1);
        }
-       if (CudaFailedMsg(cudaThreadSynchronize()))
+       if (GPUFailedMsg(cudaThreadSynchronize()))
        {
-               HLTError("CUDA Error while synchronizing (%s) (Slice %d; %d/%d)", state, sliceLocal, slice, fgkNSlices);
+               HLTError("CUDA Error while synchronizing (%s) (Stream %d; %d/%d)", state, stream, slice, fgkNSlices);
                return(1);
        }
        if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
        return(0);
 }
 
-void AliHLTTPCCAGPUTrackerNVCC::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
-{
-       //Set Debug Level and Debug output File if applicable
-       fDebugLevel = dwLevel;
-       if (NewOutFile) fOutFile = NewOutFile;
-}
-
-int AliHLTTPCCAGPUTrackerNVCC::SetGPUTrackerOption(char* OptionName, int OptionValue)
-{
-       //Set a specific GPU Tracker Option
-       if (strcmp(OptionName, "PPMode") == 0)
-       {
-               fPPMode = OptionValue;
-       }
-       else if (strcmp(OptionName, "DebugMask") == 0)
-       {
-               fDebugMask = OptionValue;
-       }
-       else if (strcmp(OptionName, "HelperThreads") == 0)
-       {
-               fNHelperThreads = OptionValue;
-       }
-       else if (strcmp(OptionName, "CPUTrackers") == 0)
-       {
-               fNCPUTrackers = OptionValue;
-       }
-       else if (strcmp(OptionName, "SlicesPerCPUTracker") == 0)
-       {
-               fNSlicesPerCPUTracker = OptionValue;
-       }
-       else if (strcmp(OptionName, "GlobalTracking") == 0)
-       {
-               fGlobalTracking = OptionValue;
-       }
-       else
-       {
-               HLTError("Unknown Option: %s", OptionName);
-               return(1);
-       }
-
-       if (fNHelperThreads + fNCPUTrackers > fNSlaveThreads && fCudaInitialized)
-       {
-               HLTInfo("Insufficient Slave Threads available (%d), creating additional Slave Threads (%d+%d)\n", fNSlaveThreads, fNHelperThreads, fNCPUTrackers);
-               StopHelperThreads();
-               StartHelperThreads();
-       }
-
-       return(0);
-}
-
-#ifdef HLTCA_STANDALONE
-void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int iSlice, int i)
-{
-       //Run Performance Query for timer i of slice iSlice
-       if (fDebugLevel >= 1)
-       {
-               AliHLTTPCCATracker::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));
-       }
-}
-#else
-void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
-#endif
-
 #if defined(BITWISE_COMPATIBLE_DEBUG_OUTPUT) || defined(HLTCA_GPU_ALTERNATIVE_SCHEDULER)
 void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker*, int, bool) {}
 #else
@@ -823,10 +269,10 @@ void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int i
                int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);
                int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);
                uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * fConstructorBlockCount);
-               CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
-               CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
-               CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * fConstructorBlockCount, cudaMemcpyDeviceToHost));
-               CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
+               GPUFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
+               GPUFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
+               GPUFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * fConstructorBlockCount, cudaMemcpyDeviceToHost));
+               GPUFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
 
                int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
                for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
@@ -877,235 +323,50 @@ __global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTra
 {
        //Initialize GPU RowBlocks and HitWeights
        int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;
-       const int stride = blockDim.x * gridDim.x;
+       const int stride = get_global_size(0);
        int4 i0;
        i0.x = i0.y = i0.z = i0.w = 0;
 #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
        int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;
        int4 i1;
        i1.x = i1.y = i1.z = i1.w = -1;
-       for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)
+       for (int i = get_global_id(0);i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)
                RowBlockPos[i] = i0;
-       for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)
+       for (int i = get_global_id(0);i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)
                rowBlockTracklets4[i] = i1;
 #endif
-       for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)
+       for (int i = get_global_id(0);i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)
                sliceDataHitWeights4[i] = i0;
 }
 
-int AliHLTTPCCAGPUTrackerNVCC::SelfHealReconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
-{
-       if (!fSelfheal)
-       {
-               cuCtxPopCurrent((CUcontext*) fCudaContext);
-               return(1);
-       }
-       static bool selfHealing = false;
-       if (selfHealing)
-       {
-               HLTError("Selfhealing failed, giving up");
-               cuCtxPopCurrent((CUcontext*) fCudaContext);
-               return(1);
-       }
-       else
-       {
-               HLTError("Unsolvable CUDA error occured, trying to reinitialize GPU");
-       }                       
-       selfHealing = true;
-       ExitGPU();
-       if (InitGPU(fSliceCount, fCudaDevice))
-       {
-               HLTError("Could not reinitialize CUDA device, disabling GPU tracker");
-               ExitGPU();
-               return(1);
-       }
-       HLTInfo("GPU tracker successfully reinitialized, restarting tracking");
-       int retVal = Reconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal);
-       selfHealing = false;
-       return(retVal);
-}
-
-void AliHLTTPCCAGPUTrackerNVCC::ReadEvent(AliHLTTPCCAClusterData* pClusterData, int firstSlice, int iSlice, int threadId)
-{
-       fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
-#ifdef HLTCA_GPU_TIME_PROFILE
-       unsigned long long int a, b;
-       AliHLTTPCCATracker::StandaloneQueryTime(&a);
-#endif
-       fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
-#ifdef HLTCA_GPU_TIME_PROFILE
-       AliHLTTPCCATracker::StandaloneQueryTime(&b);
-       printf("Read %d %f %f\n", threadId, ((double) b - (double) a) / (double) fProfTimeC, ((double) a - (double) fProfTimeD) / (double) fProfTimeC);
-#endif
-}
-
-void AliHLTTPCCAGPUTrackerNVCC::WriteOutput(AliHLTTPCCASliceOutput** pOutput, int firstSlice, int iSlice, int threadId)
-{
-       if (fDebugLevel >= 3) printf("GPU Tracker running WriteOutput for slice %d on thread %d\n", firstSlice + iSlice, threadId);
-       fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
-#ifdef HLTCA_GPU_TIME_PROFILE
-       unsigned long long int a, b;
-       AliHLTTPCCATracker::StandaloneQueryTime(&a);
-#endif
-       if (fNHelperThreads) pthread_mutex_lock((pthread_mutex_t*) fHelperMemMutex);
-       fSlaveTrackers[firstSlice + iSlice].WriteOutputPrepare();
-       if (fNHelperThreads) pthread_mutex_unlock((pthread_mutex_t*) fHelperMemMutex);
-       fSlaveTrackers[firstSlice + iSlice].WriteOutput();
-#ifdef HLTCA_GPU_TIME_PROFILE
-       AliHLTTPCCATracker::StandaloneQueryTime(&b);
-       printf("Write %d %f %f\n", threadId, ((double) b - (double) a) / (double) fProfTimeC, ((double) a - (double) fProfTimeD) / (double) fProfTimeC);
-#endif
-       if (fDebugLevel >= 3) printf("GPU Tracker finished WriteOutput for slice %d on thread %d\n", firstSlice + iSlice, threadId);
-}
-
 int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
 {
        //Primary reconstruction function
 
        cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
 
-       if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;
-
-       if (!fCudaInitialized)
-       {
-               HLTError("GPUTracker not initialized");
-               return(1);
-       }
-       if (sliceCountLocal > fSliceCount)
-       {
-               HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);
-               return(1);
-       }
-       if (fThreadId != GetThread())
-       {
-               HLTWarning("CUDA thread changed, migrating context, Previous Thread: %d, New Thread: %d", fThreadId, GetThread());
-               fThreadId = GetThread();
-       }
-
-       if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);
-
-       if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
-       {
-               HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));
-               return(1);
-       }
-       
-       cuCtxPushCurrent(*((CUcontext*) fCudaContext));
-       if (fPPMode)
-       {
-               int retVal = ReconstructPP(pOutput, pClusterData, firstSlice, sliceCountLocal);
-               cuCtxPopCurrent((CUcontext*) fCudaContext);
-               return(retVal);
-       }
-
-       for (int i = fNHelperThreads;i < fNCPUTrackers + fNHelperThreads;i++)
-       {
-               fHelperParams[i].CPUTracker = 1;
-               fHelperParams[i].pClusterData = pClusterData;
-               fHelperParams[i].pOutput = pOutput;
-               fHelperParams[i].fSliceCount = sliceCountLocal;
-               fHelperParams[i].fFirstSlice = firstSlice;
-               pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]);
-       }
-       sliceCountLocal -= fNCPUTrackers * fNSlicesPerCPUTracker;
-       if (sliceCountLocal < 0) sliceCountLocal = 0;
-
-       fUseGlobalTracking = fGlobalTracking && sliceCountLocal == fgkNSlices;
-
-       memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
-
-       if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
-
-#ifdef HLTCA_GPU_TIME_PROFILE
-       AliHLTTPCCATracker::StandaloneQueryFreq(&fProfTimeC);
-       AliHLTTPCCATracker::StandaloneQueryTime(&fProfTimeD);
-#endif
-
-       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
-       {
-               //Make this a GPU Tracker
-               fGpuTracker[iSlice].SetGPUTracker();
-               fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
-               fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
-               fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
-
-               //Set Pointers to GPU Memory
-               char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
-
-               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
-               tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
-               tmpMem = alignPointer(tmpMem, 1024 * 1024);
-
-               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
-               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount);
-               tmpMem = alignPointer(tmpMem, 1024 * 1024);
-
-               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
-               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
-               tmpMem = alignPointer(tmpMem, 1024 * 1024);
-
-               if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY RANDOM_ERROR)
-               {
-                       HLTError("Insufficiant Track Memory");
-                       cudaThreadSynchronize();
-                       cuCtxPopCurrent((CUcontext*) fCudaContext);
-                       ResetHelperThreads(0);
-                       return(1);
-               }
-
-               if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY RANDOM_ERROR)
-               {
-                       HLTError("Insufficiant Global Memory");
-                       cudaThreadSynchronize();
-                       cuCtxPopCurrent((CUcontext*) fCudaContext);
-                       ResetHelperThreads(0);
-                       return(1);
-               }
-
-               if (fDebugLevel >= 3)
-               {
-                       HLTInfo("GPU Global Memory Used: %d/%d, Page Locked Tracks Memory used: %d / %d", (int) (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice)), HLTCA_GPU_GLOBAL_MEMORY, (int) fGpuTracker[iSlice].TrackMemorySize(), HLTCA_GPU_TRACKS_MEMORY);
-               }
-
-               //Initialize Startup Constants
-               *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
-               *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
-               *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
-               fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = sliceCountLocal > fConstructorBlockCount ? (iSlice < fConstructorBlockCount) : fConstructorBlockCount * (iSlice + 1) / sliceCountLocal - fConstructorBlockCount * (iSlice) / sliceCountLocal;
-               if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);
-               fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
-               fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
-               fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
-               fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fNextTracklet = (fConstructorBlockCount / sliceCountLocal + (fConstructorBlockCount % sliceCountLocal > iSlice)) * HLTCA_GPU_THREAD_COUNT;
-               fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
-       }
+       if (Reconstruct_Base_Init(pOutput, pClusterData, firstSlice, sliceCountLocal)) return(1);
 
 #ifdef HLTCA_GPU_TEXTURE_FETCH
        cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
        size_t offset;
-       if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
+       if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
        {
                HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
-               cudaThreadSynchronize();
-               cuCtxPopCurrent((CUcontext*) fCudaContext);
                ResetHelperThreads(0);
                return(1);
        }
        cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
-       if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
+       if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
        {
                HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
-               cudaThreadSynchronize();
-               cuCtxPopCurrent((CUcontext*) fCudaContext);
                ResetHelperThreads(0);
                return(1);
        }
        cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
-       if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
+       if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
        {
                HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
-               cudaThreadSynchronize();
-               cuCtxPopCurrent((CUcontext*) fCudaContext);
                ResetHelperThreads(0);
                return(1);
        }
@@ -1115,90 +376,39 @@ int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, Ali
        if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
        char* tmpMem;
-       if (CudaFailedMsg(cudaMalloc(&tmpMem, 100000000)))
+       if (GPUFailedMsg(cudaMalloc(&tmpMem, 100000000)))
        {
                HLTError("Error allocating CUDA profile memory");
-               cudaThreadSynchronize();
-               cuCtxPopCurrent((CUcontext*) fCudaContext);
                ResetHelperThreads(0);
                return(1);
        }
        fGpuTracker[0].fStageAtSync = tmpMem;
-       CudaFailedMsg(cudaMemset(fGpuTracker[0].StageAtSync(), 0, 100000000));
+       GPUFailedMsg(cudaMemset(fGpuTracker[0].StageAtSync(), 0, 100000000));
 #endif
-       CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));
-       if (CUDASync("Initialization (1)", 0, firstSlice) RANDOM_ERROR)
+       GPUFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));
+       if (GPUSync("Initialization (1)", 0, firstSlice) RANDOM_ERROR)
        {
-               cudaThreadSynchronize();
-               cuCtxPopCurrent((CUcontext*) fCudaContext);
                ResetHelperThreads(0);
                return(1);
        }
 
-       for (int i = 0;i < fNHelperThreads;i++)
-       {
-               fHelperParams[i].CPUTracker = 0;
-               fHelperParams[i].fDone = 0;
-               fHelperParams[i].fPhase = 0;
-               fHelperParams[i].pClusterData = pClusterData;
-               fHelperParams[i].fSliceCount = sliceCountLocal;
-               fHelperParams[i].fFirstSlice = firstSlice;
-               pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]);
-       }
-
        for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
        {
-               StandalonePerfTime(firstSlice + iSlice, 0);
-
-               //Initialize GPU Slave Tracker
-               if (fDebugLevel >= 3) HLTInfo("Creating Slice Data (Slice %d)", iSlice);
-               if (iSlice % (fNHelperThreads + 1) == 0)
-               {
-                       ReadEvent(pClusterData, firstSlice, iSlice, 0);
-               }
-               else
-               {
-                       if (fDebugLevel >= 3) HLTInfo("Waiting for helper thread %d", iSlice % (fNHelperThreads + 1) - 1);
-                       while(fHelperParams[iSlice % (fNHelperThreads + 1) - 1].fDone < iSlice);
-               }
-
-               if (fDebugLevel >= 4)
-               {
-#ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
-                       *fOutFile << std::endl << std::endl << "Reconstruction: " << iSlice << "/" << sliceCountLocal << " Total Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << " / " << fgkNSlices << std::endl;
-#endif
-                       if (fDebugMask & 1) fSlaveTrackers[firstSlice + iSlice].DumpSliceData(*fOutFile);
-               }
-
-               if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY RANDOM_ERROR)
-               {
-                       HLTError("Insufficiant Slice Data Memory");
-                       cudaThreadSynchronize();
-                       cuCtxPopCurrent((CUcontext*) fCudaContext);
-                       ResetHelperThreads(1);
-                       return(1);
-               }
-
-               if (fDebugLevel >= 3)
-               {
-                       HLTInfo("GPU Slice Data Memory Used: %d/%d", (int) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize(), HLTCA_GPU_SLICE_DATA_MEMORY);
-               }
+               if (Reconstruct_Base_SliceInit(pClusterData, iSlice, firstSlice)) return(1);
 
                //Initialize temporary memory where needed
                if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");           
                PreInitRowBlocks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());
-               if (CUDASync("Initialization (2)", iSlice, iSlice + firstSlice) RANDOM_ERROR)
+               if (GPUSync("Initialization (2)", 2, iSlice + firstSlice) RANDOM_ERROR)
                {
-                       cudaThreadSynchronize();
-                       cuCtxPopCurrent((CUcontext*) fCudaContext);
                        ResetHelperThreads(1);
                        return(1);
                }
 
                //Copy Data to GPU Global Memory
-               CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
-               CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
-               CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
+               GPUFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
+               GPUFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
+               GPUFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
 
                if (fDebugLevel >= 4)
                {
@@ -1207,10 +417,8 @@ int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, Ali
                        fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() );
                }
 
-               if (CUDASync("Initialization (3)", iSlice, iSlice + firstSlice) RANDOM_ERROR)
+               if (GPUSync("Initialization (3)", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR)
                {
-                       cudaThreadSynchronize();
-                       cuCtxPopCurrent((CUcontext*) fCudaContext);
                        ResetHelperThreads(1);
                        return(1);
                }
@@ -1219,10 +427,8 @@ int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, Ali
                if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder (Slice %d/%d)", iSlice, sliceCountLocal);
                AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), HLTCA_GPU_THREAD_COUNT_FINDER, 0, cudaStreams[iSlice & 1]>>>(iSlice);
 
-               if (CUDASync("Neighbours finder", iSlice, iSlice + firstSlice) RANDOM_ERROR)
+               if (GPUSync("Neighbours finder", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR)
                {
-                       cudaThreadSynchronize();
-                       cuCtxPopCurrent((CUcontext*) fCudaContext);
                        ResetHelperThreads(1);
                        return(1);
                }
@@ -1231,16 +437,14 @@ int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, Ali
 
                if (fDebugLevel >= 4)
                {
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
+                       GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
                        if (fDebugMask & 2) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
                }
 
                if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner (Slice %d/%d)", iSlice, sliceCountLocal);
                AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice);
-               if (CUDASync("Neighbours Cleaner", iSlice, iSlice + firstSlice) RANDOM_ERROR)
+               if (GPUSync("Neighbours Cleaner", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR)
                {
-                       cudaThreadSynchronize();
-                       cuCtxPopCurrent((CUcontext*) fCudaContext);
                        ResetHelperThreads(1);
                        return(1);
                }
@@ -1249,16 +453,14 @@ int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, Ali
 
                if (fDebugLevel >= 4)
                {
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
+                       GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
                        if (fDebugMask & 4) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
                }
 
                if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder (Slice %d/%d)", iSlice, sliceCountLocal);
                AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice);
-               if (CUDASync("Start Hits Finder", iSlice, iSlice + firstSlice) RANDOM_ERROR)
+               if (GPUSync("Start Hits Finder", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR)
                {
-                       cudaThreadSynchronize();
-                       cuCtxPopCurrent((CUcontext*) fCudaContext);
                        ResetHelperThreads(1);
                        return(1);
                }
@@ -1267,10 +469,8 @@ int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, Ali
 
                if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter (Slice %d/%d)", iSlice, sliceCountLocal);
                AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice);
-               if (CUDASync("Start Hits Sorter", iSlice, iSlice + firstSlice) RANDOM_ERROR)
+               if (GPUSync("Start Hits Sorter", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR)
                {
-                       cudaThreadSynchronize();
-                       cuCtxPopCurrent((CUcontext*) fCudaContext);
                        ResetHelperThreads(1);
                        return(1);
                }
@@ -1279,13 +479,11 @@ int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, Ali
 
                if (fDebugLevel >= 2)
                {
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
+                       GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
                        if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d (Slice %d)", *fSlaveTrackers[firstSlice + iSlice].NTracklets(), iSlice);
                        if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS RANDOM_ERROR)
                        {
                                HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant");
-                               cudaThreadSynchronize();
-                               cuCtxPopCurrent((CUcontext*) fCudaContext);
                                ResetHelperThreads(1);
                                return(1);
                        }
@@ -1294,14 +492,14 @@ int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, Ali
                if (fDebugLevel >= 4 && *fSlaveTrackers[firstSlice + iSlice].NTracklets())
                {
 #ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
+                       GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
                        if (fDebugMask & 8)
                        {
                                *fOutFile << "Temporary ";
                                fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
                        }
                        uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
-                       CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
+                       GPUFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
                        if (fDebugMask & 16)
                        {
                                *fOutFile << "Start Hits Sort Vector:" << std::endl;
@@ -1313,7 +511,7 @@ int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, Ali
                        free(tmpMemory);
 #endif
 
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
+                       GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
                        if (fDebugMask & 32) fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
                }
 
@@ -1337,7 +535,7 @@ RestartTrackletConstructor:
        for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
        {
                AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);
-               if (CUDASync("Tracklet Initializer", iSlice, iSlice + firstSlice) RANDOM_ERROR)
+               if (GPUSync("Tracklet Initializer", -1, iSlice + firstSlice) RANDOM_ERROR)
                {
                        cudaThreadSynchronize();
                        cuCtxPopCurrent((CUcontext*) fCudaContext);
@@ -1349,7 +547,7 @@ RestartTrackletConstructor:
 
        if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
        AliHLTTPCCATrackletConstructorGPU<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR>>>();
-       if (CUDASync("Tracklet Constructor", 0, firstSlice) RANDOM_ERROR)
+       if (GPUSync("Tracklet Constructor", -1, firstSlice) RANDOM_ERROR)
        {
                cudaThreadSynchronize();
                cuCtxPopCurrent((CUcontext*) fCudaContext);
@@ -1363,13 +561,13 @@ RestartTrackletConstructor:
                for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
                {
                        if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
+                       GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
                        if (fDebugLevel >= 5)
                        {
                                HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
                        }
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
+                       GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
+                       GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
                        if (0 && fSlaveTrackers[firstSlice + iSlice].NTracklets() && fSlaveTrackers[firstSlice + iSlice].Tracklet(0).NHits() < 0)
                        {
                                cudaThreadSynchronize();
@@ -1387,7 +585,7 @@ RestartTrackletConstructor:
                if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;
                if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);
                AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<selectorBlockCount, HLTCA_GPU_THREAD_COUNT_SELECTOR, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));
-               if (CUDASync("Tracklet Selector", iSlice, iSlice + firstSlice) RANDOM_ERROR)
+               if (GPUSync("Tracklet Selector", iSlice, iSlice + firstSlice) RANDOM_ERROR)
                {
                        cudaThreadSynchronize();
                        cuCtxPopCurrent((CUcontext*) fCudaContext);
@@ -1398,33 +596,8 @@ RestartTrackletConstructor:
 
        char *tmpMemoryGlobalTracking = NULL;
        fSliceOutputReady = 0;
-       if (fUseGlobalTracking)
-       {
-               int tmpmemSize = sizeof(AliHLTTPCCATracklet)
-#ifdef EXTERN_ROW_HITS
-               + HLTCA_ROW_COUNT * sizeof(int)
-#endif
-               + 16;
-               tmpMemoryGlobalTracking = (char*) malloc(tmpmemSize * fgkNSlices);
-               for (int i = 0;i < fgkNSlices;i++)
-               {
-                       fSliceLeftGlobalReady[i] = 0;
-                       fSliceRightGlobalReady[i] = 0;
-               }
-               memset(fGlobalTrackingDone, 0, fgkNSlices);
-               memset(fWriteOutputDone, 0, fgkNSlices);
-
-               for (int iSlice = 0;iSlice < fgkNSlices;iSlice++)
-               {
-                       fSlaveTrackers[iSlice].SetGPUTrackerTrackletsMemory(tmpMemoryGlobalTracking + (tmpmemSize * iSlice), 1, fConstructorBlockCount);
-               }
-       }
-       for (int i = 0;i < fNHelperThreads;i++)
-       {
-               fHelperParams[i].fPhase = 1;
-               fHelperParams[i].pOutput = pOutput;
-               pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]);
-       }
+       
+       if (Reconstruct_Base_StartGlobal(pOutput, tmpMemoryGlobalTracking)) return(1);
 
        int tmpSlice = 0, tmpSlice2 = 0;
        for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
@@ -1433,10 +606,10 @@ RestartTrackletConstructor:
 
                while(tmpSlice < sliceCountLocal && (tmpSlice == iSlice || cudaStreamQuery(cudaStreams[tmpSlice]) == (cudaError_t) CUDA_SUCCESS))
                {
-                       if (CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice])) RANDOM_ERROR)
+                       if (GPUFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice])) RANDOM_ERROR)
                        {
                                ResetHelperThreads(1);
-                               cudaThreadSynchronize();
+                               ActivateThreadContext();
                                return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal));
                        }
                        tmpSlice++;
@@ -1444,21 +617,24 @@ RestartTrackletConstructor:
 
                while (tmpSlice2 < tmpSlice && (tmpSlice2 == iSlice ? cudaStreamSynchronize(cudaStreams[tmpSlice2]) : cudaStreamQuery(cudaStreams[tmpSlice2])) == (cudaError_t) CUDA_SUCCESS)
                {
-                       CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].Tracks(), fGpuTracker[tmpSlice2].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + tmpSlice2].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));
-                       CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].TrackHits(), fGpuTracker[tmpSlice2].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + tmpSlice2].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));
+                       if (*fSlaveTrackers[firstSlice + tmpSlice2].NTracks() > 0)
+                       {
+                               GPUFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].Tracks(), fGpuTracker[tmpSlice2].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + tmpSlice2].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));
+                               GPUFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].TrackHits(), fGpuTracker[tmpSlice2].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + tmpSlice2].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));
+                       }
                        tmpSlice2++;
                }
 
-               if (CudaFailedMsg(cudaStreamSynchronize(cudaStreams[iSlice])) RANDOM_ERROR)
+               if (GPUFailedMsg(cudaStreamSynchronize(cudaStreams[iSlice])) RANDOM_ERROR)
                {
                        ResetHelperThreads(1);
-                       cudaThreadSynchronize();
+                       ActivateThreadContext();
                        return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal));
                }
 
                if (fDebugLevel >= 4)
                {
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
+                       GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
 #ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
                        if (fDebugMask & 256) fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
 #endif
@@ -1481,81 +657,28 @@ RestartTrackletConstructor:
                                if (fDebugLevel >= 4)
                                {
                                        ResetHelperThreads(1);
-                                       cudaThreadSynchronize();
-                                       cuCtxPopCurrent((CUcontext*) fCudaContext);
                                        return(1);
                                }
                                for (int i = 0;i < sliceCountLocal;i++)
                                {
                                        cudaThreadSynchronize();
-                                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));
+                                       GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));
                                        *fSlaveTrackers[firstSlice + i].NTracks() = 0;
                                        *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;
                                        fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;
-                                       CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));
+                                       GPUFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));
                                        PreInitRowBlocks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());
                                }
                                goto RestartTrackletConstructor;
                        }
 #endif
                        HLTError("GPU Tracker returned Error Code %d in slice %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError, firstSlice + iSlice);
-                       cudaThreadSynchronize();
-                       cuCtxPopCurrent((CUcontext*) fCudaContext);
                        ResetHelperThreads(1);
                        return(1);
                }
                if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
 
-               fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTracks = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracks;
-               fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTrackHits = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTrackHits;
-               if (fUseGlobalTracking) fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracklets = 1;
-
-               if (fDebugLevel >= 3) HLTInfo("Data ready for slice %d, helper thread %d", iSlice, iSlice % (fNHelperThreads + 1));
-               fSliceOutputReady = iSlice;
-
-               if (fUseGlobalTracking)
-               {
-                       if (iSlice % (fgkNSlices / 2) == 2)
-                       {
-                               int tmpId = iSlice % (fgkNSlices / 2) - 1;
-                               if (iSlice >= fgkNSlices / 2) tmpId += fgkNSlices / 2;
-                               GlobalTracking(tmpId, 0, NULL);
-                               fGlobalTrackingDone[tmpId] = 1;
-                       }
-                       for (int tmpSlice3a = 0;tmpSlice3a < iSlice;tmpSlice3a += fNHelperThreads + 1)
-                       {
-                               int tmpSlice3 = tmpSlice3a + 1;
-                               if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2);
-                               if (tmpSlice3 >= iSlice) break;
-
-                               int sliceLeft = (tmpSlice3 + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2);
-                               int sliceRight = (tmpSlice3 + 1) % (fgkNSlices / 2);
-                               if (tmpSlice3 >= fgkNSlices / 2)
-                               {
-                                       sliceLeft += fgkNSlices / 2;
-                                       sliceRight += fgkNSlices / 2;
-                               }
-
-                               if (tmpSlice3 % (fgkNSlices / 2) != 1 && fGlobalTrackingDone[tmpSlice3] == 0 && sliceLeft < iSlice && sliceRight < iSlice)
-                               {
-                                       GlobalTracking(tmpSlice3, 0, NULL);
-                                       fGlobalTrackingDone[tmpSlice3] = 1;
-                               }
-
-                               if (fWriteOutputDone[tmpSlice3] == 0 && fSliceLeftGlobalReady[tmpSlice3] && fSliceRightGlobalReady[tmpSlice3])
-                               {
-                                       WriteOutput(pOutput, firstSlice, tmpSlice3, 0);
-                                       fWriteOutputDone[tmpSlice3] = 1;
-                               }
-                       }
-               }
-               else
-               {
-                       if (iSlice % (fNHelperThreads + 1) == 0)
-                       {
-                               WriteOutput(pOutput, firstSlice, iSlice, 0);
-                       }
-               }
+               if (Reconstruct_Base_FinishSlices(pOutput, iSlice, firstSlice)) return(1);
 
                if (fDebugLevel >= 4)
                {
@@ -1564,46 +687,7 @@ RestartTrackletConstructor:
                }
        }
 
-       if (fUseGlobalTracking)
-       {
-               for (int tmpSlice3a = 0;tmpSlice3a < fgkNSlices;tmpSlice3a += fNHelperThreads + 1)
-               {
-                       int tmpSlice3 = (tmpSlice3a + 1);
-                       if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2);
-                       if (fGlobalTrackingDone[tmpSlice3] == 0) GlobalTracking(tmpSlice3, 0, NULL);
-               }
-               for (int tmpSlice3a = 0;tmpSlice3a < fgkNSlices;tmpSlice3a += fNHelperThreads + 1)
-               {
-                       int tmpSlice3 = (tmpSlice3a + 1);
-                       if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2);
-                       if (fWriteOutputDone[tmpSlice3] == 0)
-                       {
-                               while (fSliceLeftGlobalReady[tmpSlice3] == 0 || fSliceRightGlobalReady[tmpSlice3] == 0);
-                               WriteOutput(pOutput, firstSlice, tmpSlice3, 0);
-                       }
-               }
-       }
-
-       for (int i = 0;i < fNHelperThreads + fNCPUTrackers;i++)
-       {
-               pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]);
-       }
-
-       if (fUseGlobalTracking)
-       {
-               free(tmpMemoryGlobalTracking);
-               if (fDebugLevel >= 3)
-               {
-                       for (int iSlice = 0;iSlice < fgkNSlices;iSlice++)
-                       {
-                               printf("Slice %d - Tracks: Local %d Global %d - Hits: Local %d Global %d\n", iSlice, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTracks, fSlaveTrackers[iSlice].CommonMemory()->fNTracks, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTrackHits, fSlaveTrackers[iSlice].CommonMemory()->fNTrackHits);
-                       }
-               }
-       }
-
-       StandalonePerfTime(firstSlice, 10);
-
-       if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
+       if (Reconstruct_Base_Finalize(pOutput, tmpMemoryGlobalTracking, firstSlice)) return(1);
 
        /*for (int i = firstSlice;i < firstSlice + sliceCountLocal;i++)
        {
@@ -1623,7 +707,7 @@ RestartTrackletConstructor:
 
 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
        char* stageAtSync = (char*) malloc(100000000);
-       CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].StageAtSync(), 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
+       GPUFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].StageAtSync(), 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
        cudaFree(fGpuTracker[0].StageAtSync());
 
        FILE* fp = fopen("profile.txt", "w+");
@@ -1692,11 +776,11 @@ __global__ void ClearPPHitWeights(int sliceCount)
                AliHLTTPCCATracker &tracker = ((AliHLTTPCCATracker*) gAliHLTTPCCATracker)[k];
                int4* const pHitWeights = (int4*) tracker.Data().HitWeights();
                const int dwCount = tracker.Data().NumberOfHitsPlusAlign();
-               const int stride = blockDim.x * gridDim.x;
+               const int stride = get_global_size(0);
                int4 i0;
                i0.x = i0.y = i0.z = i0.w = 0;
 
-               for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < dwCount * sizeof(int) / sizeof(int4);i += stride)
+               for (int i = get_global_id(0);i < dwCount * sizeof(int) / sizeof(int4);i += stride)
                {
                        pHitWeights[i] = i0;
                }
@@ -1732,7 +816,6 @@ int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, A
                fGpuTracker[iSlice].SetGPUTracker();
                fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
 
-
                fGpuTracker[iSlice].SetGPUSliceDataMemory(tmpSliceMemGpu, RowMemory(fGPUMemory, iSlice));
                fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
 
@@ -1741,7 +824,6 @@ int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, A
                tmpSliceMemGpu += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();
                tmpSliceMemGpu = alignPointer(tmpSliceMemGpu, 64 * 1024);
 
-
                //Set Pointers to GPU Memory
                char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
 
@@ -1777,26 +859,26 @@ int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, A
 
                fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
 
-               if (CUDASync("Initialization", iSlice, iSlice + firstSlice)) return(1);
+               if (GPUSync("Initialization", -1, iSlice + firstSlice)) return(1);
                StandalonePerfTime(firstSlice + iSlice, 1);
        }
 
 #ifdef HLTCA_GPU_TEXTURE_FETCH
        cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
        size_t offset;
-       if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
+       if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
        {
                HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
                return(1);
        }
        cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
-       if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
+       if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
        {
                HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
                return(1);
        }
        cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
-       if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
+       if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
        {
                HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
                return(1);
@@ -1805,34 +887,34 @@ int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, A
 
        //Copy Tracker Object to GPU Memory
        if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
-       CudaFailedMsg(cudaMemcpyToSymbol(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice));
+       GPUFailedMsg(cudaMemcpyToSymbol(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice));
 
        //Copy Data to GPU Global Memory
        for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
        {
-               CudaFailedMsg(cudaMemcpy(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice));
+               GPUFailedMsg(cudaMemcpy(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice));
                //printf("%lld %lld %d %d\n", (size_t) (char*) fGpuTracker[iSlice].Data().Memory(), (size_t) (char*) fSlaveTrackers[firstSlice + iSlice].Data().Memory(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize());
        }
-       //CudaFailedMsg(cudaMemcpy(SliceDataMemory(fGPUMemory, 0), SliceDataMemory(fHostLockedMemory, 0), tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0), cudaMemcpyHostToDevice));
+       //GPUFailedMsg(cudaMemcpy(SliceDataMemory(fGPUMemory, 0), SliceDataMemory(fHostLockedMemory, 0), tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0), cudaMemcpyHostToDevice));
        //printf("%lld %lld %d\n", (size_t) (char*) SliceDataMemory(fGPUMemory, 0), (size_t) (char*) SliceDataMemory(fHostLockedMemory, 0), (int) (size_t) (tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0)));
-       CudaFailedMsg(cudaMemcpy(fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyHostToDevice));
-       CudaFailedMsg(cudaMemcpy(fGpuTracker[0].SliceDataRows(), fSlaveTrackers[firstSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) * sliceCountLocal, cudaMemcpyHostToDevice));
+       GPUFailedMsg(cudaMemcpy(fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyHostToDevice));
+       GPUFailedMsg(cudaMemcpy(fGpuTracker[0].SliceDataRows(), fSlaveTrackers[firstSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) * sliceCountLocal, cudaMemcpyHostToDevice));
 
        if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");
        AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursFinder> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_FINDER>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows());
-       if (CUDASync("Neighbours finder", 0, firstSlice)) return 1;
+       if (GPUSync("Neighbours finder", -1, firstSlice)) return 1;
        StandalonePerfTime(firstSlice, 2);
        if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");
        AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursCleaner> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 2);
-       if (CUDASync("Neighbours Cleaner", 0, firstSlice)) return 1;
+       if (GPUSync("Neighbours Cleaner", -1, firstSlice)) return 1;
        StandalonePerfTime(firstSlice, 3);
        if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");
        AliHLTTPCCAProcessMultiA<AliHLTTPCCAStartHitsFinder> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 6);
-       if (CUDASync("Start Hits Finder", 0, firstSlice)) return 1;
+       if (GPUSync("Start Hits Finder", -1, firstSlice)) return 1;
        StandalonePerfTime(firstSlice, 4);
 
        ClearPPHitWeights <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(sliceCountLocal);
-       if (CUDASync("Clear Hit Weights", 0, firstSlice)) return 1;
+       if (GPUSync("Clear Hit Weights", -1, firstSlice)) return 1;
 
        for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
        {
@@ -1843,22 +925,22 @@ int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, A
 
        if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
        AliHLTTPCCATrackletConstructorGPUPP<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR>>>(0, sliceCountLocal);
-       if (CUDASync("Tracklet Constructor PP", 0, firstSlice)) return 1;
+       if (GPUSync("Tracklet Constructor PP", -1, firstSlice)) return 1;
 
        StandalonePerfTime(firstSlice, 8);
 
        AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<selectorBlockCount, HLTCA_GPU_THREAD_COUNT_SELECTOR>>>(0, sliceCountLocal);
-       if (CUDASync("Tracklet Selector", 0, firstSlice)) return 1;
+       if (GPUSync("Tracklet Selector", -1, firstSlice)) return 1;
        StandalonePerfTime(firstSlice, 9);
 
-       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice].CommonMemory(), fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyDeviceToHost));
+       GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice].CommonMemory(), fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyDeviceToHost));
 
        for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
        {
                if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");
 
-               CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost));
-               CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost));
+               GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost));
+               GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost));
 
                if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
                {
@@ -1920,19 +1002,7 @@ int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, A
        return(0);
 }
 
-int AliHLTTPCCAGPUTrackerNVCC::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
-{
-       //Initialize Slice Tracker Parameter for a slave tracker
-       fSlaveTrackers[iSlice].Initialize(param);
-       if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
-       {
-               HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
-               return(1);
-       }
-       return(0);
-}
-
-int AliHLTTPCCAGPUTrackerNVCC::ExitGPU()
+int AliHLTTPCCAGPUTrackerNVCC::ExitGPU_Runtime()
 {
        //Uninitialize CUDA
        cuCtxPushCurrent(*((CUcontext*) fCudaContext));
@@ -1954,19 +1024,12 @@ int AliHLTTPCCAGPUTrackerNVCC::ExitGPU()
                cudaFreeHost(fHostLockedMemory);
        }
 
-       if (CudaFailedMsg(cudaThreadExit()))
+       if (GPUFailedMsg(cudaThreadExit()))
        {
                HLTError("Could not uninitialize GPU");
                return(1);
        }
 
-       if (StopHelperThreads()) return(1);
-       pthread_mutex_destroy((pthread_mutex_t*) fHelperMemMutex);
-       free(fHelperMemMutex);
-
-       for (int i = 0;i < fgkNSlices;i++) pthread_mutex_destroy(&((pthread_mutex_t*) fSliceGlobalMutexes)[i]);
-       free(fSliceGlobalMutexes);
-
        cuCtxDestroy(*((CUcontext*) fCudaContext));
 
        cudaDeviceReset();
@@ -1976,100 +1039,6 @@ int AliHLTTPCCAGPUTrackerNVCC::ExitGPU()
        return(0);
 }
 
-void AliHLTTPCCAGPUTrackerNVCC::ResetHelperThreads(int helpers)
-{
-       HLTImportant("Error occurred, GPU tracker helper threads will be reset (Number of threads %d/%d)", fNHelperThreads, fNCPUTrackers);
-       for (int i = 0;i < fNHelperThreads + fNCPUTrackers;i++)
-       {
-               fHelperParams[i].fReset = true;
-               if (helpers || i >= fNHelperThreads) pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]);
-       }
-       HLTImportant("GPU Tracker helper threads have ben reset");
-}
-
-int AliHLTTPCCAGPUTrackerNVCC::StopHelperThreads()
-{
-       if (fNSlaveThreads)
-       {
-               for (int i = 0;i < fNSlaveThreads;i++)
-               {
-                       fHelperParams[i].fTerminate = true;
-                       if (pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]))
-                       {
-                               HLTError("Error unlocking mutex to terminate slave");
-                               return(1);
-                       }
-                       if (pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]))
-                       {
-                               HLTError("Error locking mutex");
-                               return(1);
-                       }
-                       if (pthread_join( *((pthread_t*) fHelperParams[i].fThreadId), NULL))
-                       {
-                               HLTError("Error waiting for thread to terminate");
-                               return(1);
-                       }
-                       free(fHelperParams[i].fThreadId);
-                       for (int j = 0;j < 2;j++)
-                       {
-                               if (pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]))
-                               {
-                                       HLTError("Error unlocking mutex before destroying");
-                                       return(1);
-                               }
-                               pthread_mutex_destroy(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]);
-                       }
-                       free(fHelperParams[i].fMutex);
-               }
-               delete[] fHelperParams;
-       }
-       fNSlaveThreads = 0;
-       return(0);
-}
-
-void AliHLTTPCCAGPUTrackerNVCC::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
-{
-       //Set Output Control Pointers
-       fOutputControl = val;
-       for (int i = 0;i < fgkNSlices;i++)
-       {
-               fSlaveTrackers[i].SetOutputControl(val);
-       }
-}
-
-int AliHLTTPCCAGPUTrackerNVCC::GetThread()
-{
-       //Get Thread ID
-#ifdef R__WIN32
-       return((int) (size_t) GetCurrentThread());
-#else
-       return((int) syscall (SYS_gettid));
-#endif
-}
-
-unsigned long long int* AliHLTTPCCAGPUTrackerNVCC::PerfTimer(int iSlice, unsigned int i)
-{
-       //Returns pointer to PerfTimer i of slice iSlice
-       return(fSlaveTrackers ? fSlaveTrackers[iSlice].PerfTimer(i) : NULL);
-}
-
-const AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerNVCC::OutputControl() const
-{
-       //Return Pointer to Output Control Structure
-       return fOutputControl;
-}
-
-int AliHLTTPCCAGPUTrackerNVCC::GetSliceCount() const
-{
-       //Return max slice count processable
-       return(fSliceCount);
-}
-
-char* AliHLTTPCCAGPUTrackerNVCC::MergerBaseMemory()
-{
-       return(alignPointer((char*) fGPUMergerHostMemory, 1024 * 1024));
-}
-
 int AliHLTTPCCAGPUTrackerNVCC::RefitMergedTracks(AliHLTTPCGMMerger* Merger)
 {
 #ifndef HLTCA_GPU_MERGER
@@ -2102,7 +1071,6 @@ int AliHLTTPCCAGPUTrackerNVCC::RefitMergedTracks(AliHLTTPCGMMerger* Merger)
        AssignMemory(field, gpumem, 6);
        AssignMemory(param, gpumem, 1);
 
-
        if ((size_t) (gpumem - (char*) fGPUMergerMemory) > (size_t) fGPUMergerMaxMemory)
        {
                HLTError("Insufficiant GPU Merger Memory");
@@ -2112,25 +1080,25 @@ int AliHLTTPCCAGPUTrackerNVCC::RefitMergedTracks(AliHLTTPCGMMerger* Merger)
 
        if (fDebugLevel >= 2) HLTInfo("Running GPU Merger (%d/%d)", Merger->NOutputTrackClusters(), Merger->NClusters());
        AliHLTTPCCATracker::StandaloneQueryTime(&a);
-       CudaFailedMsg(cudaMemcpy(X, Merger->ClusterX(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
-       CudaFailedMsg(cudaMemcpy(Y, Merger->ClusterY(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
-       CudaFailedMsg(cudaMemcpy(Z, Merger->ClusterZ(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
-       CudaFailedMsg(cudaMemcpy(Angle, Merger->ClusterAngle(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
-       CudaFailedMsg(cudaMemcpy(RowType, Merger->ClusterRowType(), Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyHostToDevice));
-       CudaFailedMsg(cudaMemcpy(tracks, Merger->OutputTracks(), Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyHostToDevice));
-       CudaFailedMsg(cudaMemcpy(field, Merger->PolinomialFieldBz(), 6 * sizeof(float), cudaMemcpyHostToDevice));
-       CudaFailedMsg(cudaMemcpy(param, fSlaveTrackers[0].pParam(), sizeof(AliHLTTPCCAParam), cudaMemcpyHostToDevice));
+       GPUFailedMsg(cudaMemcpy(X, Merger->ClusterX(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
+       GPUFailedMsg(cudaMemcpy(Y, Merger->ClusterY(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
+       GPUFailedMsg(cudaMemcpy(Z, Merger->ClusterZ(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
+       GPUFailedMsg(cudaMemcpy(Angle, Merger->ClusterAngle(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
+       GPUFailedMsg(cudaMemcpy(RowType, Merger->ClusterRowType(), Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyHostToDevice));
+       GPUFailedMsg(cudaMemcpy(tracks, Merger->OutputTracks(), Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyHostToDevice));
+       GPUFailedMsg(cudaMemcpy(field, Merger->PolinomialFieldBz(), 6 * sizeof(float), cudaMemcpyHostToDevice));
+       GPUFailedMsg(cudaMemcpy(param, fSlaveTrackers[0].pParam(), sizeof(AliHLTTPCCAParam), cudaMemcpyHostToDevice));
        AliHLTTPCCATracker::StandaloneQueryTime(&b);
        RefitTracks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(tracks, Merger->NOutputTracks(), field, X, Y, Z, RowType, Angle, param);
-       CudaFailedMsg(cudaThreadSynchronize());
+       GPUFailedMsg(cudaThreadSynchronize());
        AliHLTTPCCATracker::StandaloneQueryTime(&c);
-       CudaFailedMsg(cudaMemcpy(Merger->ClusterX(), X, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
-       CudaFailedMsg(cudaMemcpy(Merger->ClusterY(), Y, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
-       CudaFailedMsg(cudaMemcpy(Merger->ClusterZ(), Z, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
-       CudaFailedMsg(cudaMemcpy(Merger->ClusterAngle(), Angle, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
-       CudaFailedMsg(cudaMemcpy(Merger->ClusterRowType(), RowType, Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyDeviceToHost));
-       CudaFailedMsg(cudaMemcpy((void*) Merger->OutputTracks(), tracks, Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyDeviceToHost));
-       CudaFailedMsg(cudaThreadSynchronize());
+       GPUFailedMsg(cudaMemcpy(Merger->ClusterX(), X, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
+       GPUFailedMsg(cudaMemcpy(Merger->ClusterY(), Y, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
+       GPUFailedMsg(cudaMemcpy(Merger->ClusterZ(), Z, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
+       GPUFailedMsg(cudaMemcpy(Merger->ClusterAngle(), Angle, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
+       GPUFailedMsg(cudaMemcpy(Merger->ClusterRowType(), RowType, Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyDeviceToHost));
+       GPUFailedMsg(cudaMemcpy((void*) Merger->OutputTracks(), tracks, Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyDeviceToHost));
+       GPUFailedMsg(cudaThreadSynchronize());
        AliHLTTPCCATracker::StandaloneQueryTime(&d);
        if (fDebugLevel >= 2) HLTInfo("GPU Merger Finished");
 
@@ -2149,9 +1117,23 @@ int AliHLTTPCCAGPUTrackerNVCC::RefitMergedTracks(AliHLTTPCGMMerger* Merger)
 #endif
 }
 
-int AliHLTTPCCAGPUTrackerNVCC::IsInitialized()
+int AliHLTTPCCAGPUTrackerNVCC::GPUMergerAvailable()
+{
+       return(1); //Always returns 1; presumably signals that this CUDA backend provides the GPU merger (RefitMergedTracks) - confirm against the base class contract
+}
+
+void AliHLTTPCCAGPUTrackerNVCC::ActivateThreadContext()
 {
-       return(fCudaInitialized);
+       cuCtxPushCurrent(*((CUcontext*) fCudaContext)); //Push the CUcontext stored behind fCudaContext onto the calling thread; the returned CUresult is not checked
+}
+void AliHLTTPCCAGPUTrackerNVCC::ReleaseThreadContext()
+{
+       cuCtxPopCurrent((CUcontext*) fCudaContext); //Pop the calling thread's current CUDA context; the popped handle is written back into *fCudaContext, the returned CUresult is not checked
+}
+
+void AliHLTTPCCAGPUTrackerNVCC::SynchronizeGPU()
+{
+       GPUFailedMsg(cudaThreadSynchronize()); //Wait until the device has finished all outstanding work; the status goes through GPUFailedMsg (as for the other synchronize calls in this file) instead of being dropped silently
 }
 
 AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()
diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu.x86_64-pc-linux-gnu.patch b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu.x86_64-pc-linux-gnu.patch
deleted file mode 100755 (executable)
index 3202e11..0000000
+++ /dev/null
@@ -1,122 +0,0 @@
---- AliHLTTPCCAGPUTracker.cucpp        2009-05-28 12:14:09.000000000 +0200 
-+++ release/x86_64-pc-linux-gnu/code/AliHLTTPCCAGPUTracker.cucpp 2009-05-28 12:10:25.000000000 +0200
-@@ -1530,10 +1530,10 @@
- extern "C" { extern int getdate_err; }
- extern "C" tm *getdate(const char *);
- extern "C" int getdate_r(const char *__restrict__, tm *__restrict__);
--extern "C" { extern inline __attribute__((__weak__)) void *memcpy(void *__restrict__, const void *__restrict__, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))); }
-+extern "C" { extern inline void *memcpy(void *__restrict__, const void *__restrict__, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))); }
- extern "C" { extern inline void *memmove(void *, const void *, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))); }
- extern "C" void *memccpy(void *__restrict__, const void *__restrict__, int, size_t) throw() __attribute__((nonnull(1))) __attribute__((nonnull(2)));
--extern "C" { extern inline __attribute__((__weak__)) void *memset(void *, int, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))); }
-+extern "C" { extern inline void *memset(void *, int, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))); }
- extern "C" int memcmp(const void *, const void *, size_t) throw() __attribute__((__pure__)) __attribute__((nonnull(1))) __attribute__((nonnull(2)));
- extern inline void *memchr(void *, int, size_t) throw() __asm__("memchr") __attribute__((__pure__)) __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1)));
- extern inline const void *memchr(const void *, int, size_t) throw() __asm__("memchr") __attribute__((__pure__)) __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1)));
-@@ -1661,7 +1661,7 @@
- char *basename(char *) throw() __asm__("basename") __attribute__((nonnull(1)));
- const char *basename(const char *) throw() __asm__("basename") __attribute__((nonnull(1)));
- extern "C" void __warn_memset_zero_len();
--extern "C" { inline __attribute__((__weak__)) __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))) void *memcpy(void *__restrict__ __dest, const void *__restrict__ __src, size_t __len) throw()
-+extern "C" { inline __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))) void *memcpy(void *__restrict__ __dest, const void *__restrict__ __src, size_t __len) throw()
- {
- return __builtin___memcpy_chk(__dest, __src, __len, __builtin_object_size(__dest, 0));
- } }
-@@ -1673,7 +1673,7 @@
- {
- return __builtin___mempcpy_chk(__dest, __src, __len, __builtin_object_size(__dest, 0));
- } }
--extern "C" { inline __attribute__((__weak__)) __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) void *memset(void *__dest, int __ch, size_t __len) throw()
-+extern "C" { inline __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) void *memset(void *__dest, int __ch, size_t __len) throw()
- {
- if (((0) && (__len == (0))) && ((!(0)) || (__ch != 0)))
- {
-@@ -1719,8 +1719,6 @@
- return __builtin___strncat_chk(__dest, __src, __len, __builtin_object_size(__dest, 2 > 1));
- } }
- extern "C" __attribute__((__weak__)) clock_t clock() throw();
--extern "C" { extern inline __attribute__((__weak__)) void *memset(void *, int, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))); }
--extern "C" { extern inline __attribute__((__weak__)) void *memcpy(void *, const void *, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))); }
- extern "C" __attribute__((__weak__)) int abs(int) throw() __attribute__((__warn_unused_result__)) __attribute__((__const__));
- extern "C" __attribute__((__weak__)) long labs(long) throw() __attribute__((__warn_unused_result__)) __attribute__((__const__));
- extern "C" __attribute__((__weak__)) long long llabs(long long) throw() __attribute__((__warn_unused_result__)) __attribute__((__const__));
-@@ -1862,11 +1860,8 @@
- extern "C" __attribute__((__weak__)) int __isnanf(float) throw() __attribute__((__const__));
- extern "C" __attribute__((__weak__)) int __finite(double) throw() __attribute__((__const__));
- extern "C" __attribute__((__weak__)) int __finitef(float) throw() __attribute__((__const__));
--extern "C" { extern inline __attribute__((__weak__)) int __signbit(double) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); }
--extern "C" { extern inline __attribute__((__weak__)) int __signbitf(float) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); }
- extern "C" __attribute__((__weak__)) double fma(double, double, double) throw();
- extern "C" __attribute__((__weak__)) float fmaf(float, float, float) throw();
--extern "C" { extern inline __attribute__((__weak__)) int __signbitl(long double) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); }
- extern "C" __attribute__((__weak__)) int __isinfl(long double) throw() __attribute__((__const__));
- extern "C" __attribute__((__weak__)) int __isnanl(long double) throw() __attribute__((__const__));
- extern "C" __attribute__((__weak__)) int __finitel(long double) throw() __attribute__((__const__));
-@@ -1948,7 +1943,7 @@
- extern "C" __attribute__((__weak__)) double fmax(double, double) throw(); extern "C" double __fmax(double, double) throw();
- extern "C" __attribute__((__weak__)) double fmin(double, double) throw(); extern "C" double __fmin(double, double) throw();
- extern "C" int __fpclassify(double) throw() __attribute__((__const__));
--extern "C" { extern inline __attribute__((__weak__)) int __signbit(double) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); }
-+extern "C" { extern inline int __signbit(double) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); }
- extern "C" __attribute__((__weak__)) double fma(double, double, double) throw(); extern "C" double __fma(double, double, double) throw();
- extern "C" double scalb(double, double) throw(); extern "C" double __scalb(double, double) throw();
- extern "C" __attribute__((__weak__)) float acosf(float) throw(); extern "C" float __acosf(float) throw();
-@@ -2027,7 +2022,7 @@
- extern "C" __attribute__((__weak__)) float fmaxf(float, float) throw(); extern "C" float __fmaxf(float, float) throw();
- extern "C" __attribute__((__weak__)) float fminf(float, float) throw(); extern "C" float __fminf(float, float) throw();
- extern "C" int __fpclassifyf(float) throw() __attribute__((__const__));
--extern "C" { extern inline __attribute__((__weak__)) int __signbitf(float) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); }
-+extern "C" { extern inline int __signbitf(float) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); }
- extern "C" __attribute__((__weak__)) float fmaf(float, float, float) throw(); extern "C" float __fmaf(float, float, float) throw();
- extern "C" float scalbf(float, float) throw(); extern "C" float __scalbf(float, float) throw();
- extern "C" long double acosl(long double) throw(); extern "C" long double __acosl(long double) throw();
-@@ -2106,7 +2101,7 @@
- extern "C" long double fmaxl(long double, long double) throw(); extern "C" long double __fmaxl(long double, long double) throw();
- extern "C" long double fminl(long double, long double) throw(); extern "C" long double __fminl(long double, long double) throw();
- extern "C" int __fpclassifyl(long double) throw() __attribute__((__const__));
--extern "C" { extern inline __attribute__((__weak__)) int __signbitl(long double) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); }
-+extern "C" { extern inline int __signbitl(long double) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); }
- extern "C" long double fmal(long double, long double, long double) throw(); extern "C" long double __fmal(long double, long double, long double) throw();
- extern "C" long double scalbl(long double, long double) throw(); extern "C" long double __scalbl(long double, long double) throw();
- extern "C" { extern int signgam; }
-@@ -2134,19 +2129,19 @@
- double retval;
- }; }
- extern "C" int matherr(__exception *) throw();
--extern "C" { inline __attribute__((__weak__)) __attribute__((__gnu_inline__)) __attribute__((__const__)) int __signbitf(float __x) throw()
-+extern "C" { inline __attribute__((__gnu_inline__)) __attribute__((__const__)) int __signbitf(float __x) throw()
- {
- int __m;
- __asm__("pmovmskb %1, %0" : "=r" (__m) : "x" (__x));
- return __m & 8;
- } }
--extern "C" { inline __attribute__((__weak__)) __attribute__((__gnu_inline__)) __attribute__((__const__)) int __signbit(double __x) throw()
-+extern "C" { inline __attribute__((__gnu_inline__)) __attribute__((__const__)) int __signbit(double __x) throw()
- {
- int __m;
- __asm__("pmovmskb %1, %0" : "=r" (__m) : "x" (__x));
- return __m & 128;
- } }
--extern "C" { inline __attribute__((__weak__)) __attribute__((__gnu_inline__)) __attribute__((__const__)) int __signbitl(long double __x) throw()
-+extern "C" { inline __attribute__((__gnu_inline__)) __attribute__((__const__)) int __signbitl(long double __x) throw()
- {
- union { long double __l; int __i[3]; } __u = {__l: __x};
- return (((__u.__i)[2]) & 32768) != 0;
-@@ -9864,7 +9859,7 @@
- {
- __c_locale __old = __gnu_cxx::__uselocale(__cloc);
- __builtin_va_list __args;
--__builtin_stdarg_start(__args,__fmt);
-+__builtin_va_start(__args,__fmt);
- const int __ret = __builtin_vsnprintf(__out, __size, __fmt, __args);
- __builtin_va_end(__args);
- __gnu_cxx::__uselocale(__old);
-@@ -23186,7 +23186,7 @@
- static T2 *Alloc(int s) { auto T2 *p = (reinterpret_cast< T2 *>(_mm_malloc(s * sizeof(CacheLineSizeHelper< T> ), 128))); return new (p) T2 [s]; }
- static void Free(T2 *const p, int size) {
- for (int i = 0; i < size; ++i) {
--((p[i]).~CacheLineSizeHelper());
-+((p[i]).~T2());
- }
- _mm_free(p);
- } 
index 11e7d84..073aa5c 100755 (executable)
 #ifndef ALIHLTTPCCAGPUTRACKERNVCC_H
 #define ALIHLTTPCCAGPUTRACKERNVCC_H
 
-#include "AliHLTTPCCAGPUTracker.h"
-#include "AliHLTTPCCADef.h"
-#include "AliHLTTPCCATracker.h"
-#include "AliHLTLogging.h"
-#include "AliHLTTPCCASliceOutput.h"
+#include "AliHLTTPCCAGPUTrackerBase.h"
 
-#ifdef __CINT__
-typedef int cudaError_t
-#elif defined(R__WIN32)
-#include "../cmodules/pthread_mutex_win32_wrapper.h"
-#else
-#include <pthread.h>
-#include <errno.h>
-#endif
-
-class AliHLTTPCCARow;
-
-class AliHLTTPCCAGPUTrackerNVCC : public AliHLTTPCCAGPUTracker, public AliHLTLogging
+class AliHLTTPCCAGPUTrackerNVCC : public AliHLTTPCCAGPUTrackerBase
 {
-       friend void* helperWrapper(void*);
 public:
        AliHLTTPCCAGPUTrackerNVCC();
        virtual ~AliHLTTPCCAGPUTrackerNVCC();
 
-       virtual int InitGPU(int sliceCount = -1, int forceDeviceID = -1);
-       virtual int IsInitialized();
+       virtual int InitGPU_Runtime(int sliceCount = -1, int forceDeviceID = -1);
        virtual int Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1);
-       int ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1);
-       int SelfHealReconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1);
-       virtual int ExitGPU();
-
-       virtual void SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile = NULL);
-       virtual int SetGPUTrackerOption(char* OptionName, int OptionValue);
-
-       virtual unsigned long long int* PerfTimer(int iSlice, unsigned int i);
-
-       virtual int InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param);
-       virtual void SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val);
-
-       virtual const AliHLTTPCCASliceOutput::outputControlStruct* OutputControl() const;
-       virtual int GetSliceCount() const;
-
+       virtual int ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1);
+       virtual int ExitGPU_Runtime();
        virtual int RefitMergedTracks(AliHLTTPCGMMerger* Merger);
-       virtual char* MergerBaseMemory();
-
-private:
-       struct helperParam
-       {
-               void* fThreadId;
-               AliHLTTPCCAGPUTrackerNVCC* fCls;
-               int fNum;
-               int fSliceCount;
-               AliHLTTPCCAClusterData* pClusterData;
-               AliHLTTPCCASliceOutput** pOutput;
-               int fFirstSlice;
-               void* fMutex;
-               bool fTerminate;
-               int fPhase;
-               int CPUTracker;
-               volatile int fDone;
-               volatile bool fReset;
-       };
-
-       static void* RowMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + iSlice * sizeof(AliHLTTPCCARow) * (HLTCA_ROW_COUNT + 1) ); }
-       static void* CommonMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + iSlice * AliHLTTPCCATracker::CommonMemorySize() ); }
-       static void* SliceDataMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + iSlice * HLTCA_GPU_SLICE_DATA_MEMORY ); }
-       void* GlobalMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY) + iSlice * HLTCA_GPU_GLOBAL_MEMORY ); }
-       void* TracksMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY) + iSlice * HLTCA_GPU_TRACKS_MEMORY ); }
-       void* TrackerMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + iSlice * sizeof(AliHLTTPCCATracker) ); }
-       
-       void ReadEvent(AliHLTTPCCAClusterData* pClusterData, int firstSlice, int iSlice, int threadId);
-       void WriteOutput(AliHLTTPCCASliceOutput** pOutput, int firstSlice, int iSlice, int threadId);
-       int GlobalTracking(int iSlice, int threadId, helperParam* hParam);
+       virtual int GPUMergerAvailable();
 
-       int StartHelperThreads();
-       int StopHelperThreads();
-       void ResetHelperThreads(int helpers);
-       void ResetThisHelperThread(AliHLTTPCCAGPUTrackerNVCC::helperParam* par);
+protected:
+       virtual void ActivateThreadContext();
+       virtual void ReleaseThreadContext();
+       virtual void SynchronizeGPU();
+       virtual int GPUSync(char* state = "UNKNOWN", int stream = -1, int slice = 0);
 
+private:
        void DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check = true);
-       int GetThread();
-       void ReleaseGlobalLock(void* sem);
-       int CheckMemorySizes(int sliceCount);
-
-       int CUDASync(char* state = "UNKNOWN", int sliceLocal = 0, int slice = 0);
-       template <class T> T* alignPointer(T* ptr, int alignment);
-       void StandalonePerfTime(int iSlice, int i);
-#define CudaFailedMsg(x) CudaFailedMsgA(x, __FILE__, __LINE__)
-       bool CudaFailedMsgA(cudaError_t error, const char* file, int line);
-       
-       static void* helperWrapper(void*);
-
-       AliHLTTPCCATracker *fGpuTracker; //Tracker Objects that will be used on the GPU
-       void* fGPUMemory; //Pointer to GPU Memory Base Adress
-       void* fHostLockedMemory; //Pointer to Base Adress of Page Locked Host Memory for DMA Transfer
-
-       void* fGPUMergerMemory;
-       void* fGPUMergerHostMemory;
-       int fGPUMergerMaxMemory;
-
-       int fDebugLevel;                        //Debug Level for GPU Tracker
-       unsigned int fDebugMask;        //Mask which Debug Data is written to file
-       std::ostream* fOutFile;         //Debug Output Stream Pointer
-       unsigned long long int fGPUMemSize;     //Memory Size to allocate on GPU
-
-       void* fpCudaStreams; //Pointer to array of CUDA Streams
-       int fSliceCount; //Maximum Number of Slices this GPU tracker can process in parallel
-       int fCudaDevice; //CUDA device used by GPU tracker
-
-       static const int fgkNSlices = 36; //Number of Slices in Alice
-       AliHLTTPCCATracker fSlaveTrackers[fgkNSlices]; //CPU Slave Trackers for Initialization and Output
-
-       AliHLTTPCCASliceOutput::outputControlStruct* fOutputControl; //Output Control Structure
-       
-       int fThreadId; //Thread ID that is valid for the local CUDA context
-       int fCudaInitialized; //Flag if CUDA is initialized
-
-       int fPPMode; //Flag if GPU tracker runs in PP Mode
-       int fSelfheal; //Reinitialize GPU on failure
-
-       int fConstructorBlockCount; //GPU blocks used in Tracklet Constructor
-       int selectorBlockCount; //GPU blocks used in Tracklet Selector
-       
-#ifdef HLTCA_GPU_TIME_PROFILE
-       unsigned long long int fProfTimeC, fProfTimeD; //Timing
-#endif
-
        void* fCudaContext; //Pointer to CUDA context
-       
-       int fNHelperThreads; //Number of helper threads for post/preprocessing
-       helperParam* fHelperParams; //Control Struct for helper threads
-       void* fHelperMemMutex;
-       
-#ifdef __ROOT__
-#define volatile
-#endif
-       volatile int fSliceOutputReady;
-       volatile char fSliceLeftGlobalReady[fgkNSlices];
-       volatile char fSliceRightGlobalReady[fgkNSlices];
-#ifdef __ROOT__
-#undef volatile
-#endif
-       void* fSliceGlobalMutexes;
-       char fGlobalTrackingDone[fgkNSlices];
-       char fWriteOutputDone[fgkNSlices];
+       bool GPUFailedMsgA(cudaError_t error, const char* file, int line);
 
-       int fNCPUTrackers; //Number of CPU trackers to use
-       int fNSlicesPerCPUTracker; //Number of slices processed by each CPU tracker
-
-       int fGlobalTracking; //Use Global Tracking
-       int fUseGlobalTracking; 
-
-       int fNSlaveThreads;     //Number of slave threads currently active
+       void* fpCudaStreams; //Pointer to array of CUDA Streams
 
        // disable copy
        AliHLTTPCCAGPUTrackerNVCC( const AliHLTTPCCAGPUTrackerNVCC& );
diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.cl b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.cl
new file mode 100644 (file)
index 0000000..4e7a63f
--- /dev/null
@@ -0,0 +1,113 @@
+#define __OPENCL__
+#define RADEON
+
+//Disable assertions since they produce errors in GPU Code
+#ifdef assert
+#undef assert
+#endif
+#define assert(param)
+
+#include "AliHLTTPCCATrackParam.cxx"
+#include "AliHLTTPCCATrack.cxx" 
+
+#include "AliHLTTPCCAHitArea.cxx"
+#include "AliHLTTPCCAGrid.cxx"
+#include "AliHLTTPCCARow.cxx"
+#include "AliHLTTPCCAParam.cxx"
+#include "AliHLTTPCCATracker.cxx"
+
+#include "AliHLTTPCCATrackletSelector.cxx"
+#include "AliHLTTPCCANeighboursFinder.cxx"
+#include "AliHLTTPCCANeighboursCleaner.cxx"
+#include "AliHLTTPCCAStartHitsFinder.cxx"
+#include "AliHLTTPCCAStartHitsSorter.cxx"
+#include "AliHLTTPCCATrackletConstructor.cxx"
+
+__kernel void PreInitRowBlocks(__global char* gpu_mem, GPUconstant() void* pTrackerTmp, int iSlice)
+{
+       GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &pTracker = (( GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) * ) pTrackerTmp)[iSlice]; //pTrackerTmp is treated as an array of trackers; pick the entry for iSlice
+       if (gpu_mem != pTracker.GPUParametersConst()->fGPUMem) return; //Do nothing unless gpu_mem is the buffer recorded in this tracker's GPU parameters
+
+       //Zero the hit weights of this slice, written as int4 vectors (the code below touches HitWeights only, no row blocks)
+       const int nSliceDataHits = pTracker.Data().NumberOfHitsPlusAlign();
+       __global int4* SliceDataHitWeights4 = (__global int4*) pTracker.Data().HitWeights();
+
+       const int stride = get_global_size(0);
+       int4 i0;
+       i0.x = i0.y = i0.z = i0.w = 0;
+       for (int i = get_global_id(0);i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride) //Each work-item starts at its global id and advances by the global size; the bound is the hit count expressed in int4 units
+               SliceDataHitWeights4[i] = i0;
+}
+
+GPUg() void AliHLTTPCCAProcess_AliHLTTPCCANeighboursFinder(__global char* gpu_mem, GPUconstant() void* pTrackerTmp, int iSlice)
+{ //Kernel entry that drives AliHLTTPCCANeighboursFinder::Thread for the tracker of slice iSlice
+  GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &pTracker = (( GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) * ) pTrackerTmp)[iSlice];
+  if (gpu_mem != pTracker.GPUParametersConst()->fGPUMem) return; //Do nothing unless gpu_mem is the buffer recorded in this tracker's GPU parameters
+  GPUshared() typename AliHLTTPCCANeighboursFinder::MEM_LOCAL(AliHLTTPCCASharedMemory) smem;
+
+  for( int iSync=0; iSync<=AliHLTTPCCANeighboursFinder::NThreadSyncPoints(); iSync++){ //One Thread() call per step 0..NThreadSyncPoints() inclusive, with GPUsync() called before each step
+    GPUsync();
+    AliHLTTPCCANeighboursFinder::Thread( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), iSync, smem, pTracker  );
+  }
+}
+
+GPUg() void AliHLTTPCCAProcess_AliHLTTPCCANeighboursCleaner(__global char* gpu_mem, GPUconstant() void* pTrackerTmp, int iSlice)
+{ //Kernel entry that drives AliHLTTPCCANeighboursCleaner::Thread for the tracker of slice iSlice
+  GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &pTracker = (( GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) * ) pTrackerTmp)[iSlice];
+  if (gpu_mem != pTracker.GPUParametersConst()->fGPUMem) return; //Do nothing unless gpu_mem is the buffer recorded in this tracker's GPU parameters
+  GPUshared() typename AliHLTTPCCANeighboursCleaner::MEM_LOCAL(AliHLTTPCCASharedMemory) smem;
+
+  for( int iSync=0; iSync<=AliHLTTPCCANeighboursCleaner::NThreadSyncPoints(); iSync++){ //One Thread() call per step 0..NThreadSyncPoints() inclusive, with GPUsync() called before each step
+    GPUsync();
+    AliHLTTPCCANeighboursCleaner::Thread( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), iSync, smem, pTracker  );
+  }
+}
+
+GPUg() void AliHLTTPCCAProcess_AliHLTTPCCAStartHitsFinder(__global char* gpu_mem, GPUconstant() void* pTrackerTmp, int iSlice)
+{ //Kernel entry that drives AliHLTTPCCAStartHitsFinder::Thread for the tracker of slice iSlice
+  GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &pTracker = (( GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) * ) pTrackerTmp)[iSlice];
+  if (gpu_mem != pTracker.GPUParametersConst()->fGPUMem) return; //Do nothing unless gpu_mem is the buffer recorded in this tracker's GPU parameters
+  GPUshared() typename AliHLTTPCCAStartHitsFinder::MEM_LOCAL(AliHLTTPCCASharedMemory) smem;
+
+  for( int iSync=0; iSync<=AliHLTTPCCAStartHitsFinder::NThreadSyncPoints(); iSync++){ //One Thread() call per step 0..NThreadSyncPoints() inclusive, with GPUsync() called before each step
+    GPUsync();
+    AliHLTTPCCAStartHitsFinder::Thread( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), iSync, smem, pTracker  );
+  }
+}
+
+GPUg() void AliHLTTPCCAProcess_AliHLTTPCCAStartHitsSorter(__global char* gpu_mem, GPUconstant() void* pTrackerTmp, int iSlice)
+{ //Kernel entry that drives AliHLTTPCCAStartHitsSorter::Thread for the tracker of slice iSlice
+  GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &pTracker = (( GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) * ) pTrackerTmp)[iSlice];
+  if (gpu_mem != pTracker.GPUParametersConst()->fGPUMem) return; //Do nothing unless gpu_mem is the buffer recorded in this tracker's GPU parameters
+  GPUshared() typename AliHLTTPCCAStartHitsSorter::MEM_LOCAL(AliHLTTPCCASharedMemory) smem;
+
+  for( int iSync=0; iSync<=AliHLTTPCCAStartHitsSorter::NThreadSyncPoints(); iSync++){ //One Thread() call per step 0..NThreadSyncPoints() inclusive, with GPUsync() called before each step
+    GPUsync();
+    AliHLTTPCCAStartHitsSorter::Thread( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), iSync, smem, pTracker  );
+  }
+}
+
+GPUg() void AliHLTTPCCAProcessMulti_AliHLTTPCCATrackletSelector(__global char* gpu_mem, GPUconstant() void* pTrackerTmp, int firstSlice, int nSliceCount)
+{ //Kernel entry that runs AliHLTTPCCATrackletSelector::Thread for nSliceCount slices in one launch by splitting the work-groups among the slices
+  const int iSlice = nSliceCount * (get_group_id(0) + (get_num_groups(0) % nSliceCount != 0 && nSliceCount * (get_group_id(0) + 1) % get_num_groups(0) != 0)) / get_num_groups(0); //Slice (relative to firstSlice) served by this work-group; the boolean term adds 1 to the group id when the group count is not a multiple of nSliceCount
+  const int nSliceBlockOffset = get_num_groups(0) * iSlice / nSliceCount; //Index of the first work-group assigned to that slice
+  const int sliceBlockId = get_group_id(0) - nSliceBlockOffset; //This work-group's index within its slice
+  const int sliceGridDim = get_num_groups(0) * (iSlice + 1) / nSliceCount - get_num_groups(0) * (iSlice) / nSliceCount; //Number of work-groups assigned to that slice (offset of the next slice minus offset of this one)
+  GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &pTracker = (( GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) * ) pTrackerTmp)[firstSlice + iSlice];
+  if (gpu_mem != pTracker.GPUParametersConst()->fGPUMem) return; //Do nothing unless gpu_mem is the buffer recorded in this tracker's GPU parameters
+  GPUshared() typename AliHLTTPCCATrackletSelector::MEM_LOCAL(AliHLTTPCCASharedMemory) smem;
+
+  for( int iSync=0; iSync<=AliHLTTPCCATrackletSelector::NThreadSyncPoints(); iSync++){ //One Thread() call per step 0..NThreadSyncPoints() inclusive, with GPUsync() called before each step
+    GPUsync();
+    AliHLTTPCCATrackletSelector::Thread( sliceGridDim, get_local_size(0), sliceBlockId, get_local_id(0), iSync, smem, pTracker  ); //Per-slice group count and group index are passed in place of the launch-wide values
+  }
+}
+
+GPUg() void AliHLTTPCCATrackletConstructorGPU(__global char* gpu_mem, GPUconstant() void* pTrackerTmp)
+{
+       //GPU wrapper for AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU; forwards the tracker array (no per-slice index here) together with a GPUshared() scratch object
+       GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) *pTracker = ( GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) * ) pTrackerTmp ;
+       if (gpu_mem != pTracker[0].GPUParametersConst()->fGPUMem) return; //Do nothing unless gpu_mem is the buffer recorded in the first tracker's GPU parameters (only entry 0 is checked)
+       GPUshared() AliHLTTPCCATrackletConstructor::MEM_LOCAL(AliHLTTPCCASharedMemory) sMem;
+       AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(pTracker, sMem);
+}
diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.cxx b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.cxx
new file mode 100644 (file)
index 0000000..ba497ba
--- /dev/null
@@ -0,0 +1,810 @@
+// **************************************************************************
+// This file is property of and copyright by the ALICE HLT Project          *
+// ALICE Experiment at CERN, All rights reserved.                           *
+//                                                                          *
+// Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
+//                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *
+//                                     David Rohr <drohr@kip.uni-heidelberg.de>                                *
+//                  for The ALICE HLT Project.                              *
+//                                                                          *
+// Permission to use, copy, modify and distribute this software and its     *
+// documentation strictly for non-commercial purposes is hereby granted     *
+// without fee, provided that the above copyright notice appears in all     *
+// copies and that both the copyright notice and this permission notice     *
+// appear in the supporting documentation. The authors make no claims       *
+// about the suitability of this software for any purpose. It is            *
+// provided "as is" without express or implied warranty.                    *
+//                                                                          *
+//***************************************************************************
+
+#define __OPENCL__
+#define RADEON
+#define HLTCA_HOSTCODE
+
+#include <string.h>
+#include "AliHLTTPCCAGPUTrackerOpenCL.h"
+#include "AliHLTTPCCAGPUTrackerOpenCLInternals.h"
+#include "AliHLTTPCCAGPUTrackerCommon.h"
+
+#include "AliHLTTPCCATrackParam.h"
+#include "AliHLTTPCCATrack.h" 
+
+#include "AliHLTTPCCAHitArea.h"
+#include "AliHLTTPCCAGrid.h"
+#include "AliHLTTPCCARow.h"
+#include "AliHLTTPCCAParam.h"
+#include "AliHLTTPCCATracker.h"
+
+#include "AliHLTTPCCAProcess.h"
+
+#include "AliHLTTPCCATrackletSelector.h"
+#include "AliHLTTPCCANeighboursFinder.h"
+#include "AliHLTTPCCANeighboursCleaner.h"
+#include "AliHLTTPCCAStartHitsFinder.h"
+#include "AliHLTTPCCAStartHitsSorter.h"
+#include "AliHLTTPCCATrackletConstructor.h"
+#include "AliHLTTPCCAClusterData.h"
+
+#include "../makefiles/opencl_obtain_program.h"
+extern "C" char _makefile_opencl_program_cagpubuild_AliHLTTPCCAGPUTrackerOpenCL_cl[];
+
+ClassImp( AliHLTTPCCAGPUTrackerOpenCL )
+
+AliHLTTPCCAGPUTrackerOpenCL::AliHLTTPCCAGPUTrackerOpenCL() : ocl(NULL)
+{
+       //Allocate the OpenCL internals object and mark its dynamically allocated members as not yet allocated
+       ocl = new AliHLTTPCCAGPUTrackerOpenCLInternals;
+       if (ocl == NULL)
+       {
+               HLTError("Memory Allocation Error");
+               return; //ocl must not be dereferenced below if the allocation failed
+       }
+       ocl->mem_host_ptr = NULL;
+       ocl->selector_events = NULL;
+       ocl->devices = NULL;
+}
+
+AliHLTTPCCAGPUTrackerOpenCL::~AliHLTTPCCAGPUTrackerOpenCL()
+{
+       //ocl is a single object created with scalar new in the constructor, so it has to be
+       //released with scalar delete (delete[] on it would be undefined behaviour)
+       delete ocl;
+}
+
+//Error exit helper: reports msg through HLTError and makes the calling function return 1.
+//NOTE: expands to a plain brace block (not do { } while (0)), so "if (x) quit(...); else ..." does not compile.
+#define quit(msg) {HLTError(msg);return(1);} 
+
+int AliHLTTPCCAGPUTrackerOpenCL::InitGPU_Runtime(int sliceCount, int forceDeviceID)
+{
+       //Find best OPENCL device, initialize and allocate memory
+
+       cl_int ocl_error;
+       cl_uint num_platforms;
+       if (clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS) quit("Error getting OpenCL Platform Count");
+       if (num_platforms == 0) quit("No OpenCL Platform found");
+       if (fDebugLevel >= 2) HLTInfo("%d OpenCL Platforms found", num_platforms);
+       
+       //Query platforms
+       cl_platform_id* platforms = new cl_platform_id[num_platforms];
+       if (platforms == NULL) quit("Memory allocation error");
+       if (clGetPlatformIDs(num_platforms, platforms, NULL) != CL_SUCCESS) quit("Error getting OpenCL Platforms");
+
+       cl_platform_id platform;
+       bool found = false;
+       for (unsigned int i_platform = 0;i_platform < num_platforms;i_platform++)
+       {
+               char platform_profile[64], platform_version[64], platform_name[64], platform_vendor[64];
+               clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_PROFILE, 64, platform_profile, NULL);
+               clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_VERSION, 64, platform_version, NULL);
+               clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_NAME, 64, platform_name, NULL);
+               clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_VENDOR, 64, platform_vendor, NULL);
+               if (fDebugLevel >= 2) {HLTDebug("Available Platform %d: (%s %s) %s %s\n", i_platform, platform_profile, platform_version, platform_vendor, platform_name);}
+               if (strcmp(platform_vendor, "Advanced Micro Devices, Inc.") == 0)
+               {
+                       found = true;
+                       if (fDebugLevel >= 2) HLTInfo("AMD OpenCL Platform found");
+                       platform = platforms[i_platform];
+                       break;
+               }
+       }
+       if (found == false)
+       {
+               HLTError("Did not find AMD OpenCL Platform");
+               return(1);
+       }
+
+       cl_uint count, bestDevice = (cl_uint) -1;
+       long long int bestDeviceSpeed = 0, deviceSpeed;
+       if (GPUFailedMsg(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &count)))
+       {
+               HLTError("Error getting OPENCL Device Count");
+               return(1);
+       }
+
+       //Query devices
+       ocl->devices = new cl_device_id[count];
+       if (ocl->devices == NULL) quit("Memory allocation error");
+       if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, count, ocl->devices, NULL) != CL_SUCCESS) quit("Error getting OpenCL devices"); 
+
+       char device_vendor[64], device_name[64];
+       cl_device_type device_type;
+       cl_uint freq, shaders;
+
+       if (fDebugLevel >= 2) HLTInfo("Available OPENCL devices:");
+       for (unsigned int i = 0;i < count;i++)
+       {
+               if (fDebugLevel >= 3) {HLTDebug("Examining device %d\n", i);}
+               cl_uint nbits;
+
+               clGetDeviceInfo(ocl->devices[i], CL_DEVICE_NAME, 64, device_name, NULL);
+               clGetDeviceInfo(ocl->devices[i], CL_DEVICE_VENDOR, 64, device_vendor, NULL);
+               clGetDeviceInfo(ocl->devices[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL);
+               clGetDeviceInfo(ocl->devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(freq), &freq, NULL);
+               clGetDeviceInfo(ocl->devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(shaders), &shaders, NULL);
+               clGetDeviceInfo(ocl->devices[i], CL_DEVICE_ADDRESS_BITS, sizeof(nbits), &nbits, NULL);
+               //if (device_type & CL_DEVICE_TYPE_CPU) continue;
+               //if (!(device_type & CL_DEVICE_TYPE_GPU)) continue;
+               if (nbits / 8 != sizeof(void*)) continue;
+
+               deviceSpeed = (long long int) freq * (long long int) shaders;
+               if (device_type & CL_DEVICE_TYPE_GPU) deviceSpeed *= 10;
+               if (fDebugLevel >= 2) {HLTDebug("Found Device %d: %s %s (Frequency %d, Shaders %d, %d bit) (Speed Value: %lld)\n", i, device_vendor, device_name, (int) freq, (int) shaders, (int) nbits, (long long int) deviceSpeed);}
+
+               if (deviceSpeed > bestDeviceSpeed)
+               {
+                       bestDevice = i;
+                       bestDeviceSpeed = deviceSpeed;
+               }
+       }
+       if (bestDevice == (cl_uint) -1)
+       {
+               HLTWarning("No %sOPENCL Device available, aborting OPENCL Initialisation", count ? "appropriate " : "");
+               return(1);
+       }
+
+       if (forceDeviceID > -1 && forceDeviceID < (signed) count) bestDevice = forceDeviceID;
+       ocl->device = ocl->devices[bestDevice];
+
+       clGetDeviceInfo(ocl->device, CL_DEVICE_NAME, 64, device_name, NULL);
+       clGetDeviceInfo(ocl->device, CL_DEVICE_VENDOR, 64, device_vendor, NULL);
+       clGetDeviceInfo(ocl->device, CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL);
+       clGetDeviceInfo(ocl->device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(freq), &freq, NULL);
+       clGetDeviceInfo(ocl->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(shaders), &shaders, NULL);
+       if (fDebugLevel >= 2) {HLTDebug("Using OpenCL device %d: %s %s (Frequency %d, Shaders %d)\n", bestDevice, device_vendor, device_name, (int) freq, (int) shaders);}
+
+       cl_uint compute_units;
+       clGetDeviceInfo(ocl->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &compute_units, NULL);
+       
+       fConstructorBlockCount = compute_units * HLTCA_GPU_BLOCK_COUNT_CONSTRUCTOR_MULTIPLIER;
+       selectorBlockCount = compute_units * HLTCA_GPU_BLOCK_COUNT_SELECTOR_MULTIPLIER;
+
+       ocl->context = clCreateContext(NULL, count, ocl->devices, NULL, NULL, &ocl_error);
+       if (ocl_error != CL_SUCCESS)
+       {
+               HLTError("Could not create OPENCL Device Context!");
+               return(1);
+       }
+
+       //Workaround to compile CL kernel during tracker initialization
+       /*{
+               char* file = "cagpubuild/AliHLTTPCCAGPUTrackerOpenCL.cl";
+               HLTDebug("Reading source file %s\n", file);
+               FILE* fp = fopen(file, "rb");
+               if (fp == NULL)
+               {
+                       HLTDebug("Cannot open %s\n", file);
+                       return(1);
+               }
+               fseek(fp, 0, SEEK_END);
+               size_t file_size = ftell(fp);
+               fseek(fp, 0, SEEK_SET);
+
+               char* buffer = (char*) malloc(file_size + 1);
+               if (buffer == NULL)
+               {
+                       quit("Memory allocation error");
+               }
+               if (fread(buffer, 1, file_size, fp) != file_size)
+               {
+                       quit("Error reading file");
+               }
+               buffer[file_size] = 0;
+               fclose(fp);
+
+               HLTDebug("Creating OpenCL Program Object\n");
+               //Create OpenCL program object
+               ocl->program = clCreateProgramWithSource(ocl->context, (cl_uint) 1, (const char**) &buffer, NULL, &ocl_error);
+               if (ocl_error != CL_SUCCESS) quit("Error creating program object");
+
+               HLTDebug("Compiling OpenCL Program\n");
+               //Compile program
+               ocl_error = clBuildProgram(ocl->program, count, ocl->devices, "-I. -Iinclude -Icode -Ibase -Imerger-ca -Icagpubuild -I/home/qon/AMD-APP-SDK-v2.8.1.0-RC-lnx64/include -I/usr/local/cuda/include -DHLTCA_STANDALONE -DBUILD_GPU -D_64BIT  -x clc++", NULL, NULL);
+               if (ocl_error != CL_SUCCESS)
+               {
+                       HLTDebug("OpenCL Error while building program: %d (Compiler options: %s)\n", ocl_error, "");
+
+                       for (unsigned int i = 0;i < count;i++)
+                       {
+                               cl_build_status status;
+                               clGetProgramBuildInfo(ocl->program, ocl->devices[i], CL_PROGRAM_BUILD_STATUS, sizeof(status), &status, NULL);
+                               if (status == CL_BUILD_ERROR)
+                               {
+                                       size_t log_size;
+                                       clGetProgramBuildInfo(ocl->program, ocl->devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+                                       char* build_log = (char*) malloc(log_size + 1);
+                                       if (build_log == NULL) quit("Memory allocation error");
+                                       clGetProgramBuildInfo(ocl->program, ocl->devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
+                                       HLTDebug("Build Log (device %d):\n\n%s\n\n", i, build_log);
+                                       free(build_log);
+                               }
+                       }
+               }
+       }*/
+
+       if (_makefiles_opencl_obtain_program_helper(ocl->context, count, ocl->devices, &ocl->program, _makefile_opencl_program_cagpubuild_AliHLTTPCCAGPUTrackerOpenCL_cl))
+       {
+               clReleaseContext(ocl->context);
+               HLTError("Could not obtain OpenCL progarm");
+               return(1);
+       }
+       if (fDebugLevel >= 2) HLTInfo("OpenCL program loaded successfully");
+
+       ocl->kernel_row_blocks = clCreateKernel(ocl->program, "PreInitRowBlocks", &ocl_error); if (ocl_error != CL_SUCCESS) {HLTError("OPENCL Kernel Error 1");return(1);}
+       ocl->kernel_neighbours_finder = clCreateKernel(ocl->program, "AliHLTTPCCAProcess_AliHLTTPCCANeighboursFinder", &ocl_error); if (ocl_error != CL_SUCCESS) {HLTError("OPENCL Kernel Error 1");return(1);}
+       ocl->kernel_neighbours_cleaner = clCreateKernel(ocl->program, "AliHLTTPCCAProcess_AliHLTTPCCANeighboursCleaner", &ocl_error); if (ocl_error != CL_SUCCESS) {HLTError("OPENCL Kernel Error 2");return(1);}
+       ocl->kernel_start_hits_finder = clCreateKernel(ocl->program, "AliHLTTPCCAProcess_AliHLTTPCCAStartHitsFinder", &ocl_error); if (ocl_error != CL_SUCCESS) {HLTError("OPENCL Kernel Error 3");return(1);}
+       ocl->kernel_start_hits_sorter = clCreateKernel(ocl->program, "AliHLTTPCCAProcess_AliHLTTPCCAStartHitsSorter", &ocl_error); if (ocl_error != CL_SUCCESS) {HLTError("OPENCL Kernel Error 4");return(1);}
+       ocl->kernel_tracklet_selector = clCreateKernel(ocl->program, "AliHLTTPCCAProcessMulti_AliHLTTPCCATrackletSelector", &ocl_error); if (ocl_error != CL_SUCCESS) {HLTError("OPENCL Kernel Error 5");return(1);}
+       ocl->kernel_tracklet_constructor = clCreateKernel(ocl->program, "AliHLTTPCCATrackletConstructorGPU", &ocl_error); if (ocl_error != CL_SUCCESS) {HLTError("OPENCL Kernel Error 6");return(1);}
+       if (fDebugLevel >= 2) HLTInfo("OpenCL kernels created successfully");
+
+       ocl->mem_gpu = clCreateBuffer(ocl->context, CL_MEM_READ_WRITE, fGPUMemSize, NULL, &ocl_error);
+       if (ocl_error != CL_SUCCESS)
+       {
+               HLTError("OPENCL Memory Allocation Error");
+               clReleaseContext(ocl->context);
+               return(1);
+       }
+
+       ocl->mem_constant = clCreateBuffer(ocl->context, CL_MEM_READ_ONLY, HLTCA_GPU_TRACKER_CONSTANT_MEM, NULL, &ocl_error);
+       if (ocl_error != CL_SUCCESS)
+       {
+               HLTError("OPENCL Constant Memory Allocation Error");
+               clReleaseMemObject(ocl->mem_gpu);
+               clReleaseContext(ocl->context);
+               return(1);
+       }
+
+       int nStreams = CAMath::Max(3, fSliceCount);
+       if (nStreams > 36)
+       {
+               HLTError("Uhhh, more than 36 command queues requested, cannot do this. Did the TPC become larger?");
+               return(1);
+       }
+       for (int i = 0;i < nStreams;i++)
+       {
+               ocl->command_queue[i] = clCreateCommandQueue(ocl->context, ocl->device, 0, &ocl_error);
+               if (ocl_error != CL_SUCCESS) quit("Error creating OpenCL command queue");
+       }
+       if (clEnqueueMigrateMemObjects(ocl->command_queue[0], 1, &ocl->mem_gpu, 0, 0, NULL, NULL) != CL_SUCCESS) quit("Error migrating buffer");
+
+       if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
+       int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
+
+       ocl->mem_host = clCreateBuffer(ocl->context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, hostMemSize, NULL, &ocl_error);
+       if (ocl_error != CL_SUCCESS) quit("Error allocating pinned host memory");
+
+       const char* krnlGetPtr = "__kernel void krnlGetPtr(__global char* gpu_mem, __global size_t* host_mem) {if (get_global_id(0) == 0) *host_mem = (size_t) gpu_mem;}";
+       cl_program program = clCreateProgramWithSource(ocl->context, 1, (const char**) &krnlGetPtr, NULL, &ocl_error);
+       if (ocl_error != CL_SUCCESS) quit("Error creating program object");
+       ocl_error = clBuildProgram(program, 1, &ocl->device, "", NULL, NULL);
+       if (ocl_error != CL_SUCCESS)
+       {
+               char build_log[16384];
+               clGetProgramBuildInfo(program, ocl->device, CL_PROGRAM_BUILD_LOG, 16384, build_log, NULL);
+               HLTImportant("Build Log:\n\n%s\n\n", build_log);
+               quit("Error compiling program");
+       }
+       cl_kernel kernel = clCreateKernel(program, "krnlGetPtr", &ocl_error);
+       if (ocl_error != CL_SUCCESS) quit("Error creating kernel");
+       clSetKernelArg(kernel, 0, sizeof(cl_mem), &ocl->mem_gpu);
+       clSetKernelArg(kernel, 1, sizeof(cl_mem), &ocl->mem_host);
+       size_t local_size = 16, global_size = 16;
+       if (clEnqueueNDRangeKernel(ocl->command_queue[0], kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL) != CL_SUCCESS) quit("Error executing kernel");
+       clFinish(ocl->command_queue[0]);
+       clReleaseKernel(kernel);
+       clReleaseProgram(program);
+
+       if (fDebugLevel >= 2) HLTInfo("Mapping hostmemory");
+       ocl->mem_host_ptr = clEnqueueMapBuffer(ocl->command_queue[0], ocl->mem_host, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, hostMemSize, 0, NULL, NULL, &ocl_error);
+       if (ocl_error != CL_SUCCESS)
+       {
+               HLTError("Error allocating Page Locked Host Memory");
+               return(1);
+       }
+       fHostLockedMemory = ocl->mem_host_ptr;
+       if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);
+       fGPUMergerHostMemory = ((char*) fHostLockedMemory) + hostMemSize - fGPUMergerMaxMemory;
+
+       if (fDebugLevel >= 2) HLTInfo("Obtained Pointer to GPU Memory: %p", *((void**) ocl->mem_host_ptr));
+       fGPUMemory = *((void**) ocl->mem_host_ptr);
+       fGPUMergerMemory = ((char*) fGPUMemory) + fGPUMemSize - fGPUMergerMaxMemory;
+
+       if (fDebugLevel >= 1)
+       {
+               memset(ocl->mem_host_ptr, 0, hostMemSize);
+       }
+
+       ocl->selector_events = new cl_event[fSliceCount];
+
+       HLTImportant("OPENCL Initialisation successfull (%d: %s %s (Frequency %d, Shaders %d) Thread %d, Max slices: %d)", bestDevice, device_vendor, device_name, (int) freq, (int) shaders, fThreadId, fSliceCount);
+
+       return(0);
+}
+
+static const char* opencl_error_string(int errorcode)
+{
+       switch (errorcode)
+       {
+               case CL_SUCCESS:                            return "Success!";
+               case CL_DEVICE_NOT_FOUND:                   return "Device not found.";
+               case CL_DEVICE_NOT_AVAILABLE:               return "Device not available";
+               case CL_COMPILER_NOT_AVAILABLE:             return "Compiler not available";
+               case CL_MEM_OBJECT_ALLOCATION_FAILURE:      return "Memory object allocation failure";
+               case CL_OUT_OF_RESOURCES:                   return "Out of resources";
+               case CL_OUT_OF_HOST_MEMORY:                 return "Out of host memory";
+               case CL_PROFILING_INFO_NOT_AVAILABLE:       return "Profiling information not available";
+               case CL_MEM_COPY_OVERLAP:                   return "Memory copy overlap";
+               case CL_IMAGE_FORMAT_MISMATCH:              return "Image format mismatch";
+               case CL_IMAGE_FORMAT_NOT_SUPPORTED:         return "Image format not supported";
+               case CL_BUILD_PROGRAM_FAILURE:              return "Program build failure";
+               case CL_MAP_FAILURE:                        return "Map failure";
+               case CL_INVALID_VALUE:                      return "Invalid value";
+               case CL_INVALID_DEVICE_TYPE:                return "Invalid device type";
+               case CL_INVALID_PLATFORM:                   return "Invalid platform";
+               case CL_INVALID_DEVICE:                     return "Invalid device";
+               case CL_INVALID_CONTEXT:                    return "Invalid context";
+               case CL_INVALID_QUEUE_PROPERTIES:           return "Invalid queue properties";
+               case CL_INVALID_COMMAND_QUEUE:              return "Invalid command queue";
+               case CL_INVALID_HOST_PTR:                   return "Invalid host pointer";
+               case CL_INVALID_MEM_OBJECT:                 return "Invalid memory object";
+               case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:    return "Invalid image format descriptor";
+               case CL_INVALID_IMAGE_SIZE:                 return "Invalid image size";
+               case CL_INVALID_SAMPLER:                    return "Invalid sampler";
+               case CL_INVALID_BINARY:                     return "Invalid binary";
+               case CL_INVALID_BUILD_OPTIONS:              return "Invalid build options";
+               case CL_INVALID_PROGRAM:                    return "Invalid program";
+               case CL_INVALID_PROGRAM_EXECUTABLE:         return "Invalid program executable";
+               case CL_INVALID_KERNEL_NAME:                return "Invalid kernel name";
+               case CL_INVALID_KERNEL_DEFINITION:          return "Invalid kernel definition";
+               case CL_INVALID_KERNEL:                     return "Invalid kernel";
+               case CL_INVALID_ARG_INDEX:                  return "Invalid argument index";
+               case CL_INVALID_ARG_VALUE:                  return "Invalid argument value";
+               case CL_INVALID_ARG_SIZE:                   return "Invalid argument size";
+               case CL_INVALID_KERNEL_ARGS:                return "Invalid kernel arguments";
+               case CL_INVALID_WORK_DIMENSION:             return "Invalid work dimension";
+               case CL_INVALID_WORK_GROUP_SIZE:            return "Invalid work group size";
+               case CL_INVALID_WORK_ITEM_SIZE:             return "Invalid work item size";
+               case CL_INVALID_GLOBAL_OFFSET:              return "Invalid global offset";
+               case CL_INVALID_EVENT_WAIT_LIST:            return "Invalid event wait list";
+               case CL_INVALID_EVENT:                      return "Invalid event";
+               case CL_INVALID_OPERATION:                  return "Invalid operation";
+               case CL_INVALID_GL_OBJECT:                  return "Invalid OpenGL object";
+               case CL_INVALID_BUFFER_SIZE:                return "Invalid buffer size";
+               case CL_INVALID_MIP_LEVEL:                  return "Invalid mip-map level";
+               default: return "Unknown Errorcode";
+       }
+}
+
+bool AliHLTTPCCAGPUTrackerOpenCL::GPUFailedMsgA(int error, const char* file, int line)
+{
+       //Evaluate the status code of an OpenCL call: returns false for CL_SUCCESS, otherwise
+       //prints a warning with the error text and the given source location and returns true
+       const bool failed = (error != CL_SUCCESS);
+       if (failed)
+       {
+               HLTWarning("OCL Error: %d / %s (%s:%d)", error, opencl_error_string(error), file, line);
+       }
+       return(failed);
+}
+
+int AliHLTTPCCAGPUTrackerOpenCL::GPUSync(char* state, int stream, int slice)
+{
+       //Wait for OPENCL-Kernel to finish and check for OPENCL errors afterwards
+
+       if (fDebugLevel == 0) return(0);
+       for (int i = 0;i < fSliceCount;i++)
+       {
+               if (stream != -1) i = stream;
+               clFinish(ocl->command_queue[i]);
+               if (stream != -1) break;
+       }
+       if (fDebugLevel >= 3) HLTInfo("OPENCL Sync Done");
+       return(0);
+}
+
+template <class T> static inline cl_int clSetKernelArgA(cl_kernel krnl, cl_uint num, T arg)
+{
+       //Typed wrapper around clSetKernelArg: passes the local copy arg with size sizeof(T) as kernel argument number num
+       const cl_int retVal = clSetKernelArg(krnl, num, sizeof(T), &arg);
+       return(retVal);
+}
+
+static inline cl_int clExecuteKernelA(cl_command_queue queue, cl_kernel krnl, size_t local_size, size_t global_size, cl_event* pEvent)
+{
+       //Enqueue krnl on queue as a one-dimensional range with the given local and global size,
+       //without global offset and without wait list; pEvent (may be NULL) receives the event of the launch
+       const cl_uint workDim = 1;
+       const size_t* noGlobalOffset = NULL;
+       const cl_int retVal = clEnqueueNDRangeKernel(queue, krnl, workDim, noGlobalOffset, &global_size, &local_size, 0, NULL, pEvent);
+       return(retVal);
+}
+
+int AliHLTTPCCAGPUTrackerOpenCL::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
+{
+       //Primary reconstruction function
+
+       if (Reconstruct_Base_Init(pOutput, pClusterData, firstSlice, sliceCountLocal)) return(1);
+
+       //Copy Tracker Object to GPU Memory
+       if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
+
+       GPUFailedMsg(clEnqueueWriteBuffer(ocl->command_queue[0], ocl->mem_constant, CL_FALSE, 0, sizeof(AliHLTTPCCATracker) * sliceCountLocal, fGpuTracker, 0, NULL, NULL));
+
+       if (GPUSync("Initialization (1)", 0, firstSlice) RANDOM_ERROR)
+       {
+               ResetHelperThreads(0);
+               return(1);
+       }
+
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+       {
+               if (Reconstruct_Base_SliceInit(pClusterData, iSlice, firstSlice)) return(1);
+
+               //Initialize temporary memory where needed
+               if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");
+               clSetKernelArgA(ocl->kernel_row_blocks, 0, ocl->mem_gpu);
+               clSetKernelArgA(ocl->kernel_row_blocks, 1, ocl->mem_constant);
+               clSetKernelArgA(ocl->kernel_row_blocks, 2, iSlice);
+               clExecuteKernelA(ocl->command_queue[2], ocl->kernel_row_blocks, HLTCA_GPU_THREAD_COUNT, HLTCA_GPU_THREAD_COUNT * fConstructorBlockCount, NULL);
+               if (GPUSync("Initialization (2)", 2, iSlice + firstSlice) RANDOM_ERROR)
+               {
+                       ResetHelperThreads(1);
+                       return(1);
+               }
+
+               //Copy Data to GPU Global Memory
+               GPUFailedMsg(clEnqueueWriteBuffer(ocl->command_queue[iSlice & 1], ocl->mem_gpu, CL_FALSE, (char*) fGpuTracker[iSlice].CommonMemory() - (char*) fGPUMemory, fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), 0, NULL, NULL));
+               GPUFailedMsg(clEnqueueWriteBuffer(ocl->command_queue[iSlice & 1], ocl->mem_gpu, CL_FALSE, (char*) fGpuTracker[iSlice].Data().Memory() - (char*) fGPUMemory, fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), 0, NULL, NULL));
+               GPUFailedMsg(clEnqueueWriteBuffer(ocl->command_queue[iSlice & 1], ocl->mem_gpu, CL_FALSE, (char*) fGpuTracker[iSlice].SliceDataRows() - (char*) fGPUMemory, (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), 0, NULL, NULL));
+
+               if (fDebugLevel >= 4)
+               {
+                       if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
+                       fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount);
+                       fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() );
+               }
+
+               if (GPUSync("Initialization (3)", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR)
+               {
+                       ResetHelperThreads(1);
+                       return(1);
+               }
+               StandalonePerfTime(firstSlice + iSlice, 1);
+
+               if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder (Slice %d/%d)", iSlice, sliceCountLocal);
+               clSetKernelArgA(ocl->kernel_neighbours_finder, 0, ocl->mem_gpu);
+               clSetKernelArgA(ocl->kernel_neighbours_finder, 1, ocl->mem_constant);
+               clSetKernelArgA(ocl->kernel_neighbours_finder, 2, iSlice);
+               clExecuteKernelA(ocl->command_queue[iSlice & 1], ocl->kernel_neighbours_finder, HLTCA_GPU_THREAD_COUNT_FINDER, HLTCA_GPU_THREAD_COUNT_FINDER * fSlaveTrackers[firstSlice + iSlice].Param().NRows(), NULL);
+
+               if (GPUSync("Neighbours finder", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR)
+               {
+                       ResetHelperThreads(1);
+                       return(1);
+               }
+
+               StandalonePerfTime(firstSlice + iSlice, 2);
+
+               if (fDebugLevel >= 4)
+               {
+                       GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[iSlice & 1], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].Data().Memory() - (char*) fGPUMemory, fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), 0, NULL, NULL));
+                       if (fDebugMask & 2) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
+               }
+
+               if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner (Slice %d/%d)", iSlice, sliceCountLocal);
+               clSetKernelArgA(ocl->kernel_neighbours_cleaner, 0, ocl->mem_gpu);
+               clSetKernelArgA(ocl->kernel_neighbours_cleaner, 1, ocl->mem_constant);
+               clSetKernelArgA(ocl->kernel_neighbours_cleaner, 2, iSlice);
+               clExecuteKernelA(ocl->command_queue[iSlice & 1], ocl->kernel_neighbours_cleaner, HLTCA_GPU_THREAD_COUNT, HLTCA_GPU_THREAD_COUNT * (fSlaveTrackers[firstSlice + iSlice].Param().NRows() - 2), NULL);
+               if (GPUSync("Neighbours Cleaner", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR)
+               {
+                       ResetHelperThreads(1);
+                       return(1);
+               }
+
+               StandalonePerfTime(firstSlice + iSlice, 3);
+
+               if (fDebugLevel >= 4)
+               {
+                       GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[iSlice & 1], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].Data().Memory() - (char*) fGPUMemory, fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), 0, NULL, NULL));
+                       if (fDebugMask & 4) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
+               }
+
+               if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder (Slice %d/%d)", iSlice, sliceCountLocal);
+               clSetKernelArgA(ocl->kernel_start_hits_finder, 0, ocl->mem_gpu);
+               clSetKernelArgA(ocl->kernel_start_hits_finder, 1, ocl->mem_constant);
+               clSetKernelArgA(ocl->kernel_start_hits_finder, 2, iSlice);
+               clExecuteKernelA(ocl->command_queue[iSlice & 1], ocl->kernel_start_hits_finder, HLTCA_GPU_THREAD_COUNT, HLTCA_GPU_THREAD_COUNT * (fSlaveTrackers[firstSlice + iSlice].Param().NRows() - 6), NULL);
+
+               if (GPUSync("Start Hits Finder", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR)
+               {
+                       ResetHelperThreads(1);
+                       return(1);
+               }
+
+               StandalonePerfTime(firstSlice + iSlice, 4);
+
+               if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter (Slice %d/%d)", iSlice, sliceCountLocal);
+               clSetKernelArgA(ocl->kernel_start_hits_sorter, 0, ocl->mem_gpu);
+               clSetKernelArgA(ocl->kernel_start_hits_sorter, 1, ocl->mem_constant);
+               clSetKernelArgA(ocl->kernel_start_hits_sorter, 2, iSlice);
+               clExecuteKernelA(ocl->command_queue[iSlice & 1], ocl->kernel_start_hits_sorter, HLTCA_GPU_THREAD_COUNT, HLTCA_GPU_THREAD_COUNT * fConstructorBlockCount, NULL);
+               if (GPUSync("Start Hits Sorter", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR)
+               {
+                       ResetHelperThreads(1);
+                       return(1);
+               }
+
+               StandalonePerfTime(firstSlice + iSlice, 5);
+
+               if (fDebugLevel >= 2)
+               {
+                       GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[iSlice], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].CommonMemory() - (char*) fGPUMemory, fGpuTracker[iSlice].CommonMemorySize(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), 0, NULL, NULL) RANDOM_ERROR);
+                       if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d (Slice %d)", *fSlaveTrackers[firstSlice + iSlice].NTracklets(), iSlice);
+                       if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS RANDOM_ERROR)
+                       {
+                               HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant");
+                               ResetHelperThreads(1);
+                               return(1);
+                       }
+               }
+
+               if (fDebugLevel >= 4 && *fSlaveTrackers[firstSlice + iSlice].NTracklets())
+               {
+#ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
+                       GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[iSlice & 1], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].TrackletTmpStartHits() - (char*) fGPUMemory, pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), 0, NULL, NULL));
+                       if (fDebugMask & 8)
+                       {
+                               *fOutFile << "Temporary ";
+                               fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
+                       }
+                       uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
+                       GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[iSlice & 1], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].RowStartHitCountOffset() - (char*) fGPUMemory, fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), tmpMemory, 0, NULL, NULL));
+                       if (fDebugMask & 16)
+                       {
+                               *fOutFile << "Start Hits Sort Vector:" << std::endl;
+                               for (int i = 1;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows() - 5;i++)
+                               {
+                                       *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;
+                               }
+                       }
+                       free(tmpMemory);
+#endif
+
+                       GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[iSlice & 1], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].HitMemory() - (char*) fGPUMemory, fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), fSlaveTrackers[firstSlice + iSlice].HitMemory(), 0, NULL, NULL));
+                       if (fDebugMask & 32) fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
+               }
+
+               StandalonePerfTime(firstSlice + iSlice, 6);
+
+               fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
+       }
+
+       for (int i = 0;i < fNHelperThreads;i++)
+       {
+               pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]);
+       }
+
+       StandalonePerfTime(firstSlice, 7);
+
+       if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
+       for (int i = 0;i < 3;i++) clFinish(ocl->command_queue[i]);
+       clSetKernelArgA(ocl->kernel_tracklet_constructor, 0, ocl->mem_gpu);
+       clSetKernelArgA(ocl->kernel_tracklet_constructor, 1, ocl->mem_constant);
+       clExecuteKernelA(ocl->command_queue[0], ocl->kernel_tracklet_constructor, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR * fConstructorBlockCount, NULL);
+       if (GPUSync("Tracklet Constructor", 0, firstSlice) RANDOM_ERROR)
+       {
+               SynchronizeGPU();
+               return(1);
+       }
+       clFinish(ocl->command_queue[0]);
+
+       StandalonePerfTime(firstSlice, 8);
+
+       if (fDebugLevel >= 4)
+       {
+               for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+               {
+                       GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[0], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].CommonMemory() - (char*) fGPUMemory, fGpuTracker[iSlice].CommonMemorySize(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), 0, NULL, NULL));
+                       if (fDebugLevel >= 5)
+                       {
+                               HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
+                       }
+                       GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[0], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].TrackletMemory() - (char*) fGPUMemory, fGpuTracker[iSlice].TrackletMemorySize(), fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), 0, NULL, NULL));
+                       GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[0], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].HitMemory() - (char*) fGPUMemory, fGpuTracker[iSlice].HitMemorySize(), fSlaveTrackers[firstSlice + iSlice].HitMemory(), 0, NULL, NULL));
+                       if (fDebugMask & 128) fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);
+               }
+       }
+
+       int runSlices = 0;
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)
+       {
+               if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;
+               if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);
+               clSetKernelArgA(ocl->kernel_tracklet_selector, 0, ocl->mem_gpu);
+               clSetKernelArgA(ocl->kernel_tracklet_selector, 1, ocl->mem_constant);
+               clSetKernelArgA(ocl->kernel_tracklet_selector, 2, iSlice);
+               clSetKernelArgA(ocl->kernel_tracklet_selector, 3, (int) CAMath::Min(runSlices, sliceCountLocal - iSlice));
+               clExecuteKernelA(ocl->command_queue[iSlice], ocl->kernel_tracklet_selector, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR * fConstructorBlockCount, NULL);
+               if (GPUSync("Tracklet Selector", iSlice, iSlice + firstSlice) RANDOM_ERROR)
+               {
+                       SynchronizeGPU();
+                       return(1);
+               }
+       }
+       StandalonePerfTime(firstSlice, 9);
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+       {
+               clEnqueueMarkerWithWaitList(ocl->command_queue[iSlice], 0, NULL, &ocl->selector_events[iSlice]);
+       }
+
+       char *tmpMemoryGlobalTracking = NULL;
+       fSliceOutputReady = 0;
+       
+       if (Reconstruct_Base_StartGlobal(pOutput, tmpMemoryGlobalTracking)) return(1);
+
+       int tmpSlice = 0, tmpSlice2 = 0;
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+       {
+               if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");
+               cl_int eventdone;
+
+               if (tmpSlice < sliceCountLocal) GPUFailedMsg(clGetEventInfo(ocl->selector_events[tmpSlice], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(eventdone), &eventdone, NULL));
+               while(tmpSlice < sliceCountLocal && (tmpSlice == iSlice || eventdone == CL_COMPLETE))
+               {
+                       clReleaseEvent(ocl->selector_events[tmpSlice]);
+                       if (GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[tmpSlice], ocl->mem_gpu, CL_FALSE, (char*) fGpuTracker[tmpSlice].CommonMemory() - (char*) fGPUMemory, fGpuTracker[tmpSlice].CommonMemorySize(), fSlaveTrackers[firstSlice + tmpSlice].CommonMemory(), 0, NULL, &ocl->selector_events[tmpSlice]) RANDOM_ERROR))
+                       {
+                               HLTImportant("Error transferring tracks from GPU to host");
+                               ResetHelperThreads(1);
+                               ActivateThreadContext();
+                               return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal));
+                       }
+                       tmpSlice++;
+                       if (tmpSlice < sliceCountLocal) GPUFailedMsg(clGetEventInfo(ocl->selector_events[tmpSlice], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(eventdone), &eventdone, NULL));
+               }
+
+               if (tmpSlice2 < tmpSlice) GPUFailedMsg(clGetEventInfo(ocl->selector_events[tmpSlice2], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(eventdone), &eventdone, NULL));
+               while (tmpSlice2 < tmpSlice && (tmpSlice2 == iSlice ? (clFinish(ocl->command_queue[tmpSlice2]) == CL_SUCCESS) : (eventdone == CL_COMPLETE)))
+               {
+                       if (*fSlaveTrackers[firstSlice + tmpSlice2].NTracks() > 0)
+                       {
+                               GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[tmpSlice2], ocl->mem_gpu, CL_FALSE, (char*) fGpuTracker[tmpSlice2].Tracks() - (char*) fGPUMemory, sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + tmpSlice2].NTracks(), fSlaveTrackers[firstSlice + tmpSlice2].Tracks(), 0, NULL, NULL));
+                               GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[tmpSlice2], ocl->mem_gpu, CL_FALSE, (char*) fGpuTracker[tmpSlice2].TrackHits() - (char*) fGPUMemory, sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + tmpSlice2].NTrackHits(), fSlaveTrackers[firstSlice + tmpSlice2].TrackHits(), 0, NULL, NULL));
+                       }
+                       tmpSlice2++;
+                       if (tmpSlice2 < tmpSlice) GPUFailedMsg(clGetEventInfo(ocl->selector_events[tmpSlice2], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(eventdone), &eventdone, NULL));
+               }
+
+               if (GPUFailedMsg(clFinish(ocl->command_queue[iSlice])) RANDOM_ERROR)
+               {
+                       ResetHelperThreads(1);
+                       ActivateThreadContext();
+                       for (int iSlice2 = 0;iSlice2 < sliceCountLocal;iSlice2++) clReleaseEvent(ocl->selector_events[iSlice2]);
+                       return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal));
+               }
+
+               if (fDebugLevel >= 4)
+               {
+                       SynchronizeGPU();
+#ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
+                       //GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
+                       GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[0], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].TrackletMemory() - (char*) fGPUMemory, fGpuTracker[iSlice].TrackletMemorySize(), fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), 0, NULL, NULL));
+                       if (fDebugMask & 256) fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
+#endif
+                       if (fDebugMask & 512) fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);
+               }
+
+
+               if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError RANDOM_ERROR)
+               {
+                       HLTError("GPU Tracker returned Error Code %d in slice %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError, firstSlice + iSlice);
+                       ResetHelperThreads(1);
+                       for (int iSlice2 = 0;iSlice2 < sliceCountLocal;iSlice2++) clReleaseEvent(ocl->selector_events[iSlice2]);
+                       return(1);
+               }
+               if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
+
+               if (Reconstruct_Base_FinishSlices(pOutput, iSlice, firstSlice)) return(1);
+       }
+       for (int iSlice2 = 0;iSlice2 < sliceCountLocal;iSlice2++) clReleaseEvent(ocl->selector_events[iSlice2]);
+
+       if (Reconstruct_Base_Finalize(pOutput, tmpMemoryGlobalTracking, firstSlice)) return(1);
+
+       return(0);
+}
+
+int AliHLTTPCCAGPUTrackerOpenCL::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
+{
+       //ReconstructPP has no OpenCL implementation: the arguments are ignored,
+       //a fatal message is logged and a non-zero status is handed back to the caller
+       const int notImplemented = 1;
+       HLTFatal("Not implemented in OpenCL (ReconstructPP)");
+       return notImplemented;
+}
+
+int AliHLTTPCCAGPUTrackerOpenCL::ExitGPU_Runtime()
+{
+       //Uninitialize OPENCL
+
+       const int nStreams = CAMath::Max(3, fSliceCount);
+       for (int i = 0;i < nStreams;i++) clFinish(ocl->command_queue[i]);
+
+       if (fGPUMemory)
+       {
+               clReleaseMemObject(ocl->mem_gpu);
+               clReleaseMemObject(ocl->mem_constant);
+               fGPUMemory = NULL;
+
+               clReleaseKernel(ocl->kernel_neighbours_finder);
+               clReleaseKernel(ocl->kernel_neighbours_cleaner);
+               clReleaseKernel(ocl->kernel_start_hits_finder);
+               clReleaseKernel(ocl->kernel_start_hits_sorter);
+               clReleaseKernel(ocl->kernel_tracklet_constructor);
+               clReleaseKernel(ocl->kernel_tracklet_selector);
+               clReleaseKernel(ocl->kernel_row_blocks);
+       }
+       if (fHostLockedMemory)
+       {
+               clEnqueueUnmapMemObject(ocl->command_queue[0], ocl->mem_host, ocl->mem_host_ptr, 0, NULL, NULL);
+               ocl->mem_host_ptr = NULL;
+               for (int i = 0;i < nStreams;i++)
+               {
+                       clReleaseCommandQueue(ocl->command_queue[i]);
+               }
+               clReleaseMemObject(ocl->mem_host);
+               fGpuTracker = NULL;
+               fHostLockedMemory = NULL;
+       }
+
+       if (ocl->selector_events)
+       {
+               delete[] ocl->selector_events;
+               ocl->selector_events = NULL;
+       }
+       if (ocl->devices)
+       {
+               delete[] ocl->devices;
+               ocl->devices = NULL;
+       }
+
+       clReleaseProgram(ocl->program);
+       clReleaseContext(ocl->context);
+
+       HLTInfo("OPENCL Uninitialized");
+       fCudaInitialized = 0;
+       return(0);
+}
+
+int AliHLTTPCCAGPUTrackerOpenCL::RefitMergedTracks(AliHLTTPCGMMerger* Merger)
+{
+       //The merger refit has no OpenCL implementation: the argument is ignored,
+       //a fatal message is logged and a non-zero status is handed back to the caller
+       const int notImplemented = 1;
+       HLTFatal("Not implemented in OpenCL (Merger)");
+       return notImplemented;
+}
+
+void AliHLTTPCCAGPUTrackerOpenCL::ActivateThreadContext()
+{ //Intentionally empty: the OpenCL implementation does nothing here
+}
+
+void AliHLTTPCCAGPUTrackerOpenCL::ReleaseThreadContext()
+{ //Intentionally empty: the OpenCL implementation does nothing here
+}
+
+void AliHLTTPCCAGPUTrackerOpenCL::SynchronizeGPU()
+{
+       //Block the host until the first max(3, fSliceCount) command queues have finished their enqueued work
+       const int queueCount = CAMath::Max(3, fSliceCount);
+       for (int iQueue = 0;iQueue < queueCount;iQueue++)
+       {
+               clFinish(ocl->command_queue[iQueue]);
+       }
+}
+
+AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()
+{
+       //Creation entry point: despite the NVCC in its name, it hands out a heap allocated OpenCL tracker.
+       //The returned object is meant to be passed to AliHLTTPCCAGPUTrackerNVCCDestroy for deletion.
+       AliHLTTPCCAGPUTracker* const tracker = new AliHLTTPCCAGPUTrackerOpenCL;
+       return tracker;
+}
+
+void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr)
+{ //Counterpart of AliHLTTPCCAGPUTrackerNVCCCreate
+       delete ptr; //Deletes through the base class pointer; a null ptr is a no-op
+}
diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.h b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.h
new file mode 100644 (file)
index 0000000..88dfcb0
--- /dev/null
@@ -0,0 +1,65 @@
+//-*- Mode: C++ -*-
+// $Id$
+
+// ************************************************************************
+// This file is property of and copyright by the ALICE HLT Project        *
+// ALICE Experiment at CERN, All rights reserved.                         *
+// See cxx source for full Copyright notice                               *
+//                                                                        *
+//*************************************************************************
+
+//  @file   AliHLTTPCCAGPUTrackerOpenCL.h
+//  @author David Rohr, Sergey Gorbunov
+//  @date   
+//  @brief  TPC CA Tracker for OpenCL GPUs
+//  @note 
+
+
+#ifndef ALIHLTTPCCAGPUTRACKEROPENCL_H
+#define ALIHLTTPCCAGPUTRACKEROPENCL_H
+
+#include "AliHLTTPCCAGPUTrackerBase.h"
+
+struct AliHLTTPCCAGPUTrackerOpenCLInternals;
+
+class AliHLTTPCCAGPUTrackerOpenCL : public AliHLTTPCCAGPUTrackerBase // OpenCL flavour of the GPU tracker
+{
+public:
+       AliHLTTPCCAGPUTrackerOpenCL();
+       virtual ~AliHLTTPCCAGPUTrackerOpenCL();
+
+       virtual int InitGPU_Runtime(int sliceCount = -1, int forceDeviceID = -1);
+       virtual int Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1);
+       virtual int ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1); // not implemented for OpenCL: logs a fatal error and returns 1
+       virtual int ExitGPU_Runtime(); // releases the OpenCL objects held in ocl, returns 0
+       virtual int RefitMergedTracks(AliHLTTPCGMMerger* Merger); // not implemented for OpenCL: logs a fatal error and returns 1
+
+protected:
+       virtual void ActivateThreadContext(); // empty in the OpenCL implementation
+       virtual void ReleaseThreadContext(); // empty in the OpenCL implementation
+       virtual void SynchronizeGPU(); // calls clFinish on the first max(3, fSliceCount) command queues
+       virtual int GPUSync(char* state = "UNKNOWN", int sliceLocal = 0, int slice = 0); // NOTE(review): a string literal default for a char* parameter is ill-formed since C++11; const char* would be correct but must match any base class declaration - confirm
+
+private:
+       void DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check = true);
+       bool GPUFailedMsgA(int, const char* file, int line);
+       AliHLTTPCCAGPUTrackerOpenCLInternals* ocl; // OpenCL handles; the struct is only forward declared here to keep the OpenCL headers out of this header
+
+
+       // disable copy
+       AliHLTTPCCAGPUTrackerOpenCL( const AliHLTTPCCAGPUTrackerOpenCL& );
+       AliHLTTPCCAGPUTrackerOpenCL &operator=( const AliHLTTPCCAGPUTrackerOpenCL& );
+
+       ClassDef( AliHLTTPCCAGPUTrackerOpenCL, 0 )
+};
+
+#ifdef R__WIN32
+#define DLL_EXPORT __declspec(dllexport)
+#else
+#define DLL_EXPORT
+#endif
+
+extern "C" DLL_EXPORT AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate();
+extern "C" DLL_EXPORT void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr);
+
+#endif //ALIHLTTPCCAGPUTRACKEROPENCL_H
diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCLInternals.h b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCLInternals.h
new file mode 100644 (file)
index 0000000..537a628
--- /dev/null
@@ -0,0 +1,40 @@
+//-*- Mode: C++ -*-
+// $Id$
+
+// ************************************************************************
+// This file is property of and copyright by the ALICE HLT Project        *
+// ALICE Experiment at CERN, All rights reserved.                         *
+// See cxx source for full Copyright notice                               *
+//                                                                        *
+//*************************************************************************
+
+//  @file   AliHLTTPCCAGPUTrackerOpenCLInternals.h
+//  @author David Rohr, Sergey Gorbunov
+//  @date   
+//  @brief  Internal OpenCL handles of the TPC CA GPU Tracker
+//  @note 
+
+
+#ifndef ALIHLTTPCCAGPUTRACKEROPENCLINTERNALS_H
+#define ALIHLTTPCCAGPUTRACKEROPENCLINTERNALS_H
+
+#include <CL/opencl.h>
+#include <CL/cl_ext.h>
+
+struct AliHLTTPCCAGPUTrackerOpenCLInternals // raw OpenCL handles owned by AliHLTTPCCAGPUTrackerOpenCL
+{
+       cl_device_id device;
+       cl_device_id* devices; // array, freed with delete[] in ExitGPU_Runtime
+       cl_context context;
+       cl_command_queue command_queue[36]; // indexed by local slice number (or slice & 1) in the tracker; NOTE(review): 36 presumably equals the number of TPC slices - confirm
+       cl_mem mem_gpu; // device buffer passed as kernel argument 0; the tracker reads it back at byte offsets relative to fGPUMemory
+       cl_mem mem_constant; // buffer passed as kernel argument 1
+       cl_mem mem_host; // buffer object that mem_host_ptr is a mapping of
+       void* mem_host_ptr; // mapped pointer of mem_host, unmapped in ExitGPU_Runtime
+       cl_event* selector_events; // array indexed by local slice number, freed with delete[] in ExitGPU_Runtime
+       cl_program program;
+
+       cl_kernel kernel_neighbours_finder, kernel_neighbours_cleaner, kernel_start_hits_finder, kernel_start_hits_sorter, kernel_tracklet_constructor, kernel_tracklet_selector, kernel_row_blocks; // one handle per kernel, released together in ExitGPU_Runtime
+};
+
+#endif
\ No newline at end of file
index ec0a550..ab17111 100755 (executable)
@@ -1,6 +1,6 @@
 #include "AliHLTTPCCAGPUConfig.h"
 
-GPUdi() void AliHLTTPCCATrackletConstructor::CopyTrackletTempData( AliHLTTPCCAThreadMemory &rMemSrc, AliHLTTPCCAThreadMemory &rMemDst, AliHLTTPCCATrackParam &tParamSrc, AliHLTTPCCATrackParam &tParamDst)
+MEM_TEMPLATE4() GPUdi() void AliHLTTPCCATrackletConstructor::CopyTrackletTempData( MEM_TYPE(AliHLTTPCCAThreadMemory) &rMemSrc, MEM_TYPE2(AliHLTTPCCAThreadMemory) &rMemDst, MEM_TYPE3(AliHLTTPCCATrackParam) &tParamSrc, MEM_TYPE4(AliHLTTPCCATrackParam) &tParamDst)
 {
        //Copy Temporary Tracklet data from registers to global mem and vice versa
        rMemDst.fStartRow = rMemSrc.fStartRow;
@@ -48,20 +48,20 @@ GPUdi() void AliHLTTPCCATrackletConstructor::CopyTrackletTempData( AliHLTTPCCATh
 }
 
 #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
-GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tracker, AliHLTTPCCASharedMemory &sMem, int Reverse, int RowBlock, int &mustInit)
+GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &tracker, GPUshared() MEM_LOCAL(AliHLTTPCCASharedMemory) &sMem, int Reverse, int RowBlock, int &mustInit)
 {
        //Fetch a new trackled to be processed by this thread
-       __syncthreads();
+       GPUsync();
        int nextTrackletFirstRun = sMem.fNextTrackletFirstRun;
-       if (threadIdx.x == 0)
+       if (get_local_id(0) == 0)
        {
                sMem.fNTracklets = *tracker.NTracklets();
                if (sMem.fNextTrackletFirstRun)
                {
 #ifdef HLTCA_GPU_SCHED_FIXED_START
-                       const int iSlice = tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + (gridDim.x % tracker.GPUParametersConst()->fGPUnSlices != 0 && tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + 1) % gridDim.x != 0)) / gridDim.x;
-                       const int nSliceBlockOffset = gridDim.x * iSlice / tracker.GPUParametersConst()->fGPUnSlices;
-                       const uint2 &nTracklet = tracker.BlockStartingTracklet()[blockIdx.x - nSliceBlockOffset];
+                       const int iSlice = tracker.GPUParametersConst()->fGPUnSlices * (get_group_id(0) + (get_num_groups(0) % tracker.GPUParametersConst()->fGPUnSlices != 0 && tracker.GPUParametersConst()->fGPUnSlices * (get_group_id(0) + 1) % get_num_groups(0) != 0)) / get_num_groups(0);
+                       const int nSliceBlockOffset = get_num_groups(0) * iSlice / tracker.GPUParametersConst()->fGPUnSlices;
+                       const uint2 &nTracklet = tracker.BlockStartingTracklet()[get_group_id(0) - nSliceBlockOffset];
 
                        sMem.fNextTrackletCount = nTracklet.y;
                        if (sMem.fNextTrackletCount == 0)
@@ -84,7 +84,7 @@ GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tr
                else
                {
                        const int4 oldPos = *tracker.RowBlockPos(Reverse, RowBlock);
-                       const int nFetchTracks = CAMath::Max(CAMath::Min(oldPos.x - oldPos.y, HLTCA_GPU_THREAD_COUNT), 0);
+                       const int nFetchTracks = CAMath::Max(CAMath::Min(oldPos.x - oldPos.y, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR), 0);
                        sMem.fNextTrackletCount = nFetchTracks;
                        const int nUseTrack = nFetchTracks ? CAMath::AtomicAdd(&(*tracker.RowBlockPos(Reverse, RowBlock)).y, nFetchTracks) : 0;
                        sMem.fNextTrackletFirst = nUseTrack;
@@ -99,30 +99,30 @@ GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tr
                                }
                                for (int i = 0;i < nFillTracks;i++)
                                {
-                                       tracker.RowBlockTracklets(Reverse, RowBlock)[(nStartFillTrack + i) % HLTCA_GPU_MAX_TRACKLETS] = -(blockIdx.x * 1000000 + nFetchTracks * 10000 + oldPos.x * 100 + oldPos.y);     //Dummy filling track
+                                       tracker.RowBlockTracklets(Reverse, RowBlock)[(nStartFillTrack + i) % HLTCA_GPU_MAX_TRACKLETS] = -(get_group_id(0) * 1000000 + nFetchTracks * 10000 + oldPos.x * 100 + oldPos.y);        //Dummy filling track
                                }
                        }
                }
        }
-       __syncthreads();
+       GPUsync();
        mustInit = 0;
        if (sMem.fNextTrackletCount == 0)
        {
                return(-2);             //No more track in this RowBlock
        }
-       else if (threadIdx.x >= sMem.fNextTrackletCount)
+       else if (get_local_id(0) >= sMem.fNextTrackletCount)
        {
                return(-1);             //No track in this RowBlock for this thread
        }
        else if (nextTrackletFirstRun)
        {
-               if (threadIdx.x == 0) sMem.fNextTrackletFirstRun = 0;
+               if (get_local_id(0) == 0) sMem.fNextTrackletFirstRun = 0;
                mustInit = 1;
-               return(sMem.fNextTrackletFirst + threadIdx.x);
+               return(sMem.fNextTrackletFirst + get_local_id(0));
        }
        else
        {
-               const int nTrackPos = sMem.fNextTrackletFirst + threadIdx.x;
+               const int nTrackPos = sMem.fNextTrackletFirst + get_local_id(0);
                mustInit = (nTrackPos < tracker.RowBlockPos(Reverse, RowBlock)->w);
                volatile int* const ptrTracklet = &tracker.RowBlockTracklets(Reverse, RowBlock)[nTrackPos % HLTCA_GPU_MAX_TRACKLETS];
                int nTracklet;
@@ -142,31 +142,31 @@ GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tr
        }
 }
 
-GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(AliHLTTPCCATracker *pTracker)
+MEM_CLASS_PRE2 GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(MEM_LG2(AliHLTTPCCATracker) *pTracker, GPUsharedref() AliHLTTPCCATrackletConstructor::MEM_LOCAL(AliHLTTPCCASharedMemory)& sMem)
 {
        //Main Tracklet construction function that calls the scheduled (FetchTracklet) and then Processes the tracklet (mainly UpdataTracklet) and at the end stores the tracklet.
        //Can also dispatch a tracklet to be rescheduled
 #ifdef HLTCA_GPU_EMULATION_SINGLE_TRACKLET
        pTracker[0].BlockStartingTracklet()[0].x = HLTCA_GPU_EMULATION_SINGLE_TRACKLET;
        pTracker[0].BlockStartingTracklet()[0].y = 1;
-       for (int i = 1;i < gridDim.x;i++)
+       for (int i = 1;i < get_num_groups(0);i++)
        {
                pTracker[0].BlockStartingTracklet()[i].x = pTracker[0].BlockStartingTracklet()[i].y = 0;
        }
 #endif //HLTCA_GPU_EMULATION_SINGLE_TRACKLET
 
-       GPUshared() AliHLTTPCCASharedMemory sMem;
+       //GPUshared() AliHLTTPCCASharedMemory sMem;
 
 #ifdef HLTCA_GPU_SCHED_FIXED_START
-       if (threadIdx.x == 0)
+       if (get_local_id(0) == 0)
        {
                sMem.fNextTrackletFirstRun = 1;
        }
-       __syncthreads();
+       GPUsync();
 #endif //HLTCA_GPU_SCHED_FIXED_START
 
 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
-       if (threadIdx.x == 0)
+       if (get_local_id(0) == 0)
        {
                sMem.fMaxSync = 0;
        }
@@ -178,13 +178,13 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
                for (volatile int iRowBlock = 0;iRowBlock < HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1;iRowBlock++)
                {
 #ifdef HLTCA_GPU_SCHED_FIXED_SLICE
-                       int iSlice = pTracker[0].GPUParametersConst()->fGPUnSlices * (blockIdx.x + (gridDim.x % pTracker[0].GPUParametersConst()->fGPUnSlices != 0 && pTracker[0].GPUParametersConst()->fGPUnSlices * (blockIdx.x + 1) % gridDim.x != 0)) / gridDim.x;
+                       int iSlice = pTracker[0].GPUParametersConst()->fGPUnSlices * (get_group_id(0) + (get_num_groups(0) % pTracker[0].GPUParametersConst()->fGPUnSlices != 0 && pTracker[0].GPUParametersConst()->fGPUnSlices * (get_group_id(0) + 1) % get_num_groups(0) != 0)) / get_num_groups(0);
 #else
                        for (int iSlice = 0;iSlice < pTracker[0].GPUParametersConst()->fGPUnSlices;iSlice++)
 #endif //HLTCA_GPU_SCHED_FIXED_SLICE
                        {
                                AliHLTTPCCATracker &tracker = pTracker[iSlice];
-                               if (blockIdx.x != 7 && sMem.fNextTrackletFirstRun && iSlice != (tracker.GPUParametersConst()->fGPUnSlices > gridDim.x ? blockIdx.x : (tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + (gridDim.x % tracker.GPUParametersConst()->fGPUnSlices != 0 && tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + 1) % gridDim.x != 0)) / gridDim.x)))
+                               if (get_group_id(0) != 7 && sMem.fNextTrackletFirstRun && iSlice != (tracker.GPUParametersConst()->fGPUnSlices > get_num_groups(0) ? get_group_id(0) : (tracker.GPUParametersConst()->fGPUnSlices * (get_group_id(0) + (get_num_groups(0) % tracker.GPUParametersConst()->fGPUnSlices != 0 && tracker.GPUParametersConst()->fGPUnSlices * (get_group_id(0) + 1) % get_num_groups(0) != 0)) / get_num_groups(0))))
                                {
                                        continue;
                                }
@@ -196,13 +196,13 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
                                while ((iTracklet = FetchTracklet(tracker, sMem, iReverse, iRowBlock, mustInit)) != -2)
                                {
 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
-                                       CAMath::AtomicMax(&sMem.fMaxSync, threadSync);
-                                       __syncthreads();
-                                       threadSync = CAMath::Min(sMem.fMaxSync, 100000000 / blockDim.x / gridDim.x);
+                                       CAMath::AtomicMaxShared(&sMem.fMaxSync, threadSync);
+                                       GPUsync();
+                                       threadSync = CAMath::Min(sMem.fMaxSync, 100000000 / get_local_size(0) / get_num_groups(0));
 #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
                                        if (!sharedRowsInitialized)
                                        {
-                                               for (int i = threadIdx.x;i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x)
+                                               for (int i = get_local_id(0);i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += get_local_size(0))
                                                {
                                                        reinterpret_cast<int*>(&sMem.fRows)[i] = reinterpret_cast<int*>(tracker.SliceDataRows())[i];
                                                }
@@ -211,16 +211,16 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
 #ifdef HLTCA_GPU_RESCHED
                                        short2 storeToRowBlock;
                                        int storePosition = 0;
-                                       if (threadIdx.x < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1))
+                                       if (get_local_id(0) < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1))
                                        {
-                                               const int nReverse = threadIdx.x / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
-                                               const int nRowBlock = threadIdx.x % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+                                               const int nReverse = get_local_id(0) / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+                                               const int nRowBlock = get_local_id(0) % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
                                                sMem.fTrackletStoreCount[nReverse][nRowBlock] = 0;
                                        }
 #else
                                        mustInit = 1;
 #endif //HLTCA_GPU_RESCHED
-                                       __syncthreads();
+                                       GPUsync();
                                        AliHLTTPCCATrackParam tParam;
                                        AliHLTTPCCAThreadMemory rMem;
 
@@ -260,11 +260,11 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
                                                {
 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
                                                        if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && !(j >= rMem.fEndRow || ( j >= rMem.fStartRow && j - rMem.fStartRow % 2 == 0)))
-                                                               pTracker[0].StageAtSync()[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1;
+                                                               pTracker[0].StageAtSync()[threadSync++ * get_global_size(0) + get_global_id(0)] = rMem.fStage + 1;
 #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
                                                        if (iTracklet >= 0)
                                                        {
-                                                               UpdateTracklet(gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j);
+                                                               UpdateTracklet(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam, j);
                                                                if (rMem.fNMissed > kMaxRowGap && j <= rMem.fStartRow)
                                                                {
                                                                        rMem.fGo = 0;
@@ -275,7 +275,7 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
                                                        
                                                if (iTracklet >= 0 && (!rMem.fGo || iRowBlock == HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP))
                                                {
-                                                       StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam );
+                                                       StoreTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam );
                                                }
                                        }
                                        else
@@ -284,11 +284,11 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
                                                {
 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
                                                        if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && j >= rMem.fStartRow && (rMem.fStage > 0 || rMem.fCurrIH >= 0 || (j - rMem.fStartRow) % 2 == 0 ))
-                                                               pTracker[0].StageAtSync()[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1;
+                                                               pTracker[0].StageAtSync()[threadSync++ * get_global_size(0) + get_global_id(0)] = rMem.fStage + 1;
 #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
                                                        if (iTracklet >= 0)
                                                        {
-                                                               UpdateTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j);
+                                                               UpdateTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam, j);
                                                                //if (rMem.fNMissed > kMaxRowGap || rMem.fGo == 0) break;       //DR!!! CUDA Crashes with this enabled
                                                        }
                                                }
@@ -309,49 +309,49 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
 
                                                if (iTracklet >= 0 && !rMem.fGo)
                                                {
-                                                       StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam );
+                                                       StoreTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam );
                                                }
                                        }
 
                                        if (rMem.fGo && (iRowBlock != HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP || iReverse == 0))
                                        {
                                                CopyTrackletTempData( rMem, rMemGlobal, tParam, tParamGlobal );
-                                               storePosition = CAMath::AtomicAdd(&sMem.fTrackletStoreCount[storeToRowBlock.y][storeToRowBlock.x], 1);
+                                               storePosition = CAMath::AtomicAddShared(&sMem.fTrackletStoreCount[storeToRowBlock.y][storeToRowBlock.x], 1);
                                        }
 
-                                       __syncthreads();
-                                       if (threadIdx.x < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1))
+                                       GPUsync();
+                                       if (get_local_id(0) < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1))
                                        {
-                                               const int nReverse = threadIdx.x / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
-                                               const int nRowBlock = threadIdx.x % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+                                               const int nReverse = get_local_id(0) / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+                                               const int nRowBlock = get_local_id(0) % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
                                                if (sMem.fTrackletStoreCount[nReverse][nRowBlock])
                                                {
                                                        sMem.fTrackletStoreCount[nReverse][nRowBlock] = CAMath::AtomicAdd(&tracker.RowBlockPos(nReverse, nRowBlock)->x, sMem.fTrackletStoreCount[nReverse][nRowBlock]);
                                                }
                                        }
-                                       __syncthreads();
+                                       GPUsync();
                                        if (iTracklet >= 0 && rMem.fGo && (iRowBlock != HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP || iReverse == 0))
                                        {
                                                tracker.RowBlockTracklets(storeToRowBlock.y, storeToRowBlock.x)[sMem.fTrackletStoreCount[storeToRowBlock.y][storeToRowBlock.x] + storePosition] = iTracklet;
                                        }
-                                       __syncthreads();
+                                       GPUsync();
 #else
-                                       if (threadIdx.x % HLTCA_GPU_WARP_SIZE == 0)
+                                       if (get_local_id(0) % HLTCA_GPU_WARP_SIZE == 0)
                                        {
-                                               sMem.fStartRows[threadIdx.x / HLTCA_GPU_WARP_SIZE] = 160;
-                                               sMem.fEndRows[threadIdx.x / HLTCA_GPU_WARP_SIZE] = 0;
+                                               sMem.fStartRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE] = 160;
+                                               sMem.fEndRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE] = 0;
                                        }
-                                       __syncthreads();
+                                       GPUsync();
                                        if (iTracklet >= 0)
                                        {
-                                               CAMath::AtomicMin(&sMem.fStartRows[threadIdx.x / HLTCA_GPU_WARP_SIZE], rMem.fStartRow);
+                                               CAMath::AtomicMinShared(&sMem.fStartRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE], rMem.fStartRow);
                                        }
-                                       __syncthreads();
+                                       GPUsync();
                                        if (iTracklet >= 0)
                                        {
-                                               for (int j = sMem.fStartRows[threadIdx.x / HLTCA_GPU_WARP_SIZE];j < HLTCA_ROW_COUNT;j++)
+                                               for (int j = sMem.fStartRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE];j < HLTCA_ROW_COUNT;j++)
                                                {
-                                                       UpdateTracklet(gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j);
+                                                       UpdateTracklet(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam, j);
                                                        if (!rMem.fGo) break;
                                                }
 
@@ -361,19 +361,19 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
                                                {
                                                        if ( !tParam.TransportToX( tracker.Row( rMem.fEndRow ).X(), tracker.Param().ConstBz(), .999 ) )  rMem.fGo = 0;
                                                }
-                                               CAMath::AtomicMax(&sMem.fEndRows[threadIdx.x / HLTCA_GPU_WARP_SIZE], rMem.fEndRow);
+                                               CAMath::AtomicMaxShared(&sMem.fEndRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE], rMem.fEndRow);
                                        }
 
-                                       __syncthreads();
+                                       GPUsync();
                                        if (iTracklet >= 0)
                                        {
                                                for (int j = rMem.fEndRow;j >= 0;j--)
                                                {
                                                        if (!rMem.fGo) break;
-                                                       UpdateTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j);
+                                                       UpdateTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam, j);
                                                }
 
-                                               StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam );
+                                               StoreTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam );
                                        }
 #endif //HLTCA_GPU_RESCHED
                                }
@@ -424,33 +424,33 @@ GPUg() void AliHLTTPCCATrackletConstructorInit(int iSlice)
 {
        //GPU Wrapper for AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorInit
        AliHLTTPCCATracker &tracker = ( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker )[iSlice];
-       int i = blockIdx.x * blockDim.x + threadIdx.x;
+       int i = get_global_id(0);
        if (i >= *tracker.NTracklets()) return;
        AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorInit(i, tracker);
 }
 
 #elif defined(HLTCA_GPU_ALTERNATIVE_SCHEDULER_SIMPLE)
 
-GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tracker, AliHLTTPCCASharedMemory &sMem, AliHLTTPCCAThreadMemory& /*rMem*/, AliHLTTPCCATrackParam& /*tParam*/)
+GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &tracker, GPUsharedref() MEM_LOCAL(AliHLTTPCCASharedMemory) &sMem, AliHLTTPCCAThreadMemory& /*rMem*/, MEM_PLAIN(AliHLTTPCCATrackParam)& /*tParam*/)
 {
-       const int nativeslice = blockIdx.x % tracker.GPUParametersConst()->fGPUnSlices;
+       const int nativeslice = get_group_id(0) % tracker.GPUParametersConst()->fGPUnSlices;
        const int nTracklets = *tracker.NTracklets();
-       __syncthreads();
+       GPUsync();
        if (sMem.fNextTrackletFirstRun == 1)
        {
-               if (threadIdx.x == 0)
+               if (get_local_id(0) == 0)
                {
-                       sMem.fNextTrackletFirst = (blockIdx.x - nativeslice) / tracker.GPUParametersConst()->fGPUnSlices * HLTCA_GPU_THREAD_COUNT;
+                       sMem.fNextTrackletFirst = (get_group_id(0) - nativeslice) / tracker.GPUParametersConst()->fGPUnSlices * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;
                        sMem.fNextTrackletFirstRun = 0;
                }
        }
        else
        {
-               if (threadIdx.x == 0)
+               if (get_local_id(0) == 0)
                {
                        if (tracker.GPUParameters()->fNextTracklet < nTracklets)
                        {
-                               const int firstTracklet = CAMath::AtomicAdd(&tracker.GPUParameters()->fNextTracklet, HLTCA_GPU_THREAD_COUNT);
+                               const int firstTracklet = CAMath::AtomicAdd(&tracker.GPUParameters()->fNextTracklet, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR);
                                if (firstTracklet < nTracklets) sMem.fNextTrackletFirst = firstTracklet;
                                else sMem.fNextTrackletFirst = -2;
                        }
@@ -460,28 +460,27 @@ GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tr
                        }
                }
        }
-       __syncthreads();
+       GPUsync();
        return (sMem.fNextTrackletFirst);
 }
 
-GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(AliHLTTPCCATracker *pTracker)
+GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) *pTracker, GPUsharedref() AliHLTTPCCATrackletConstructor::MEM_LOCAL(AliHLTTPCCASharedMemory)& sMem)
 {
        const int nSlices = pTracker[0].GPUParametersConst()->fGPUnSlices;
-       const int nativeslice = blockIdx.x % nSlices;
-       GPUshared() AliHLTTPCCASharedMemory sMem;
+       const int nativeslice = get_group_id(0) % nSlices;
        int currentSlice = -1;
 
-       if (threadIdx.x)
+       if (get_local_id(0))
        {
                sMem.fNextTrackletFirstRun = 1;
        }
 
        for (int iSlice = 0;iSlice < nSlices;iSlice++)
        {
-               AliHLTTPCCATracker &tracker = pTracker[(nativeslice + iSlice) % nSlices];
+               GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &tracker = pTracker[(nativeslice + iSlice) % nSlices];
                int iRow, iRowEnd;
 
-               AliHLTTPCCATrackParam tParam;
+               MEM_PLAIN(AliHLTTPCCATrackParam) tParam;
                AliHLTTPCCAThreadMemory rMem;
 
                int tmpTracklet;
@@ -489,7 +488,7 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
                {
                        if (tmpTracklet >= 0)
                        {
-                               rMem.fItr = tmpTracklet + threadIdx.x;
+                               rMem.fItr = tmpTracklet + get_local_id(0);
                        }
                        else
                        {
@@ -498,17 +497,17 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
 
                        if (iSlice != currentSlice)
                        {
-                               if (threadIdx.x == 0)
+                               if (get_local_id(0) == 0)
                                {
                                        sMem.fNTracklets = *tracker.NTracklets();
                                }
 
-                               for (int i = threadIdx.x;i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x)
+                               for (int i = get_local_id(0);i < HLTCA_ROW_COUNT * sizeof(MEM_PLAIN(AliHLTTPCCARow)) / sizeof(int);i += get_local_size(0))
                                {
-                                       reinterpret_cast<int*>(&sMem.fRows)[i] = reinterpret_cast<int*>(tracker.SliceDataRows())[i];
+                                       reinterpret_cast<GPUsharedref() int*>(&sMem.fRows)[i] = reinterpret_cast<GPUglobalref() int*>(tracker.SliceDataRows())[i];
                                }
                                currentSlice = iSlice;
-                               __syncthreads();
+                               GPUsync();
                        }
 
                        if (rMem.fItr < sMem.fNTracklets)
@@ -567,41 +566,41 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
 
 #else //HLTCA_GPU_ALTERNATIVE_SCHEDULER
 
-GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tracker, AliHLTTPCCASharedMemory &sMem, AliHLTTPCCAThreadMemory &rMem, AliHLTTPCCATrackParam &tParam)
+GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &tracker, GPUsharedref() MEM_LOCAL(AliHLTTPCCASharedMemory) &sMem, AliHLTTPCCAThreadMemory &rMem, MEM_PLAIN(AliHLTTPCCATrackParam) &tParam)
 {
-       const int nativeslice = blockIdx.x % tracker.GPUParametersConst()->fGPUnSlices;
+       const int nativeslice = get_group_id(0) % tracker.GPUParametersConst()->fGPUnSlices;
        const int nTracklets = *tracker.NTracklets();
-       __syncthreads();
-       if (threadIdx.x == 0) sMem.fTrackletStorePos = 0;
+       GPUsync();
+       if (get_local_id(0) == 0) sMem.fTrackletStorePos = 0;
        int nStorePos = -1;
        if (sMem.fNextTrackletFirstRun == 1)
        {
-               if (threadIdx.x == 0)
+               if (get_local_id(0) == 0)
                {
-                       sMem.fNextTrackletFirst = (blockIdx.x - nativeslice) / tracker.GPUParametersConst()->fGPUnSlices * HLTCA_GPU_THREAD_COUNT;
+                       sMem.fNextTrackletFirst = (get_group_id(0) - nativeslice) / tracker.GPUParametersConst()->fGPUnSlices * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;
                        sMem.fNextTrackletFirstRun = 0;
-                       sMem.fNextTrackletCount = HLTCA_GPU_THREAD_COUNT;
+                       sMem.fNextTrackletCount = HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;
                }
        }
        else
        {
-               if (sMem.fNextTrackletCount < HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_ALTSCHED_MIN_THREADS)
+               if (sMem.fNextTrackletCount < HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR - HLTCA_GPU_ALTSCHED_MIN_THREADS)
                {
-                       if (threadIdx.x == 0)
+                       if (get_local_id(0) == 0)
                        {
                                sMem.fNextTrackletFirst = -1;
                        }
                }
                else
                {
-                       __syncthreads();
+                       GPUsync();
                        if (rMem.fItr != -1)
                        {
-                               nStorePos = CAMath::AtomicAdd(&sMem.fTrackletStorePos, 1);
+                               nStorePos = CAMath::AtomicAddShared(&sMem.fTrackletStorePos, 1);
                                CopyTrackletTempData(rMem, sMem.swapMemory[nStorePos].fThreadMem, tParam, sMem.swapMemory[nStorePos].fParam);
                                rMem.fItr = -1;
                        }
-                       if (threadIdx.x == 0)
+                       if (get_local_id(0) == 0)
                        {
                                if (tracker.GPUParameters()->fNextTracklet >= nTracklets)
                                {
@@ -623,12 +622,12 @@ GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tr
                }
        }
 
-       if (threadIdx.x == 0)
+       if (get_local_id(0) == 0)
        {
-               if (sMem.fNextTrackletFirst == -1 && sMem.fNextTrackletCount == HLTCA_GPU_THREAD_COUNT)
+               if (sMem.fNextTrackletFirst == -1 && sMem.fNextTrackletCount == HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR)
                {
                        sMem.fNextTrackletFirst = -2;
-                       sMem.fNextTrackletCount = HLTCA_GPU_THREAD_COUNT;
+                       sMem.fNextTrackletCount = HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;
                }
                else if (sMem.fNextTrackletFirst >= 0)
                {
@@ -642,28 +641,28 @@ GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tr
                        }
                }
        }
-       __syncthreads();
-       if (threadIdx.x < sMem.fTrackletStorePos)
+       GPUsync();
+       if (get_local_id(0) < sMem.fTrackletStorePos)
        {
-               CopyTrackletTempData(sMem.swapMemory[threadIdx.x].fThreadMem, rMem, sMem.swapMemory[threadIdx.x].fParam, tParam);
+               CopyTrackletTempData(sMem.swapMemory[get_local_id(0)].fThreadMem, rMem, sMem.swapMemory[get_local_id(0)].fParam, tParam);
        }
        return (sMem.fNextTrackletFirst);
 }
 
-GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(AliHLTTPCCATracker *pTracker)
+GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) *pTracker, GPUsharedref() AliHLTTPCCATrackletConstructor::MEM_LOCAL(AliHLTTPCCASharedMemory)& sMem)
 {
        const int nSlices = pTracker[0].GPUParametersConst()->fGPUnSlices;
-       const int nativeslice = blockIdx.x % nSlices;
-       GPUshared() AliHLTTPCCASharedMemory sMem;
+       const int nativeslice = get_group_id(0) % nSlices;
+       //GPUshared() AliHLTTPCCASharedMemory sMem;
        int currentSlice = -1;
 
-       if (threadIdx.x)
+       if (get_local_id(0))
        {
                sMem.fNextTrackletFirstRun = 1;
        }
 
 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
-       if (threadIdx.x == 0)
+       if (get_local_id(0) == 0)
        {
                sMem.fMaxSync = 0;
        }
@@ -672,9 +671,9 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
 
        for (int iSlice = 0;iSlice < nSlices;iSlice++)
        {
-               AliHLTTPCCATracker &tracker = pTracker[(nativeslice + iSlice) % nSlices];
+               GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &tracker = pTracker[(nativeslice + iSlice) % nSlices];
 
-               AliHLTTPCCATrackParam tParam;
+               MEM_PLAIN(AliHLTTPCCATrackParam) tParam;
                AliHLTTPCCAThreadMemory rMem;
                rMem.fItr = -1;
 
@@ -683,26 +682,26 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
                {
 
 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
-                                       CAMath::AtomicMax(&sMem.fMaxSync, threadSync);
-                                       __syncthreads();
-                                       threadSync = CAMath::Min(sMem.fMaxSync, 100000000 / blockDim.x / gridDim.x);
+                                       CAMath::AtomicMaxShared(&sMem.fMaxSync, threadSync);
+                                       GPUsync();
+                                       threadSync = CAMath::Min(sMem.fMaxSync, 100000000 / get_local_size(0) / get_num_groups(0));
 #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
 
                        if (iSlice != currentSlice)
                        {
-                               if (threadIdx.x == 0) sMem.fNTracklets = *tracker.NTracklets();
+                               if (get_local_id(0) == 0) sMem.fNTracklets = *tracker.NTracklets();
 
-                               for (int i = threadIdx.x;i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x)
+                               for (int i = get_local_id(0);i < HLTCA_ROW_COUNT * sizeof(MEM_PLAIN(AliHLTTPCCARow)) / sizeof(int);i += get_local_size(0))
                                {
-                                       reinterpret_cast<int*>(&sMem.fRows)[i] = reinterpret_cast<int*>(tracker.SliceDataRows())[i];
+                                       reinterpret_cast<GPUsharedref() int*>(&sMem.fRows)[i] = reinterpret_cast<GPUglobalref() int*>(tracker.SliceDataRows())[i];
                                }
                                currentSlice = iSlice;
-                               __syncthreads();
+                               GPUsync();
                        }
 
                        if (tmpTracklet >= 0 && rMem.fItr < 0)
                        {
-                               rMem.fItr = tmpTracklet + (signed) threadIdx.x - sMem.fTrackletStorePos;
+                               rMem.fItr = tmpTracklet + (signed) get_local_id(0) - sMem.fTrackletStorePos;
                                if (rMem.fItr >= sMem.fNTracklets)
                                {
                                        rMem.fItr = -1;
@@ -734,15 +733,15 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
                                        if (rMem.fStage == 2)
                                        {
                                                if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && !(rMem.fIRow >= rMem.fEndRow || ( rMem.fIRow >= rMem.fStartRow && rMem.fIRow - rMem.fStartRow % 2 == 0)))
-                                                       pTracker[0].StageAtSync()[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1;
+                                                       pTracker[0].StageAtSync()[threadSync++ * get_global_size(0) + get_global_id(0)] = rMem.fStage + 1;
                                        }
                                        else
                                        {
                                                if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && rMem.fIRow >= rMem.fStartRow && (rMem.fStage > 0 || rMem.fCurrIH >= 0 || (rMem.fIRow - rMem.fStartRow) % 2 == 0 ))
-                                                       pTracker[0].StageAtSync()[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1;
+                                                       pTracker[0].StageAtSync()[threadSync++ * get_global_size(0) + get_global_id(0)] = rMem.fStage + 1;
                                        }
 #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
-                                       UpdateTracklet(gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, rMem.fIRow);
+                                       UpdateTracklet(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam, rMem.fIRow);
                                }
 
                                if (rMem.fIRow == rMem.fIRowEnd || rMem.fNMissed > kMaxRowGap)
@@ -763,9 +762,9 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
 
                                if (!rMem.fGo)
                                {
-                                       StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam );
+                                       StoreTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam );
                                        rMem.fItr = -1;
-                                       CAMath::AtomicAdd(&sMem.fNextTrackletCount, 1);
+                                       CAMath::AtomicAddShared(&sMem.fNextTrackletCount, 1);
                                }
                        }
                }
@@ -774,17 +773,19 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
 
 #endif //HLTCA_GPU_ALTERNATIVE_SCHEDULER
 
+#ifndef __OPENCL__
 GPUg() void AliHLTTPCCATrackletConstructorGPU()
 {
        //GPU Wrapper for AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU
        //Casts gAliHLTTPCCATracker (declared outside this function) to a tracker pointer; the callee indexes it per slice.
        AliHLTTPCCATracker *pTracker = ( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker );
        //In the new version sMem is declared here with GPUshared() and passed by reference,
        //instead of being declared inside the constructor routine itself.
-       AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(pTracker);
+       GPUshared() AliHLTTPCCATrackletConstructor::MEM_LOCAL(AliHLTTPCCASharedMemory) sMem;
+       AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(pTracker, sMem);
 }
 
 GPUg() void AliHLTTPCCATrackletConstructorGPUPP(int firstSlice, int sliceCount)
 {
        //GPU Wrapper for AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPUPP
        //firstSlice: index of the first tracker in gAliHLTTPCCATracker to process
        //sliceCount: number of consecutive trackers to process; groups with an id at or beyond it return immediately
        //Each remaining group handles the tracker at firstSlice + its group id (get_group_id(0) replaces blockIdx.x).
-       if (blockIdx.x >= sliceCount) return;
-       AliHLTTPCCATracker *pTracker = &( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker )[firstSlice + blockIdx.x];
+       if (get_group_id(0) >= sliceCount) return;
+       AliHLTTPCCATracker *pTracker = &( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker )[firstSlice + get_group_id(0)];
        AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPUPP(pTracker);
 }
 
@@ -795,17 +796,17 @@ GPUd() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPUPP(
 #define startRows sMem.fStartRows
 #define endRows sMem.fEndRows
 #else
-       GPUshared() int startRows[HLTCA_GPU_THREAD_COUNT / HLTCA_GPU_WARP_SIZE + 1];
-       GPUshared() int endRows[HLTCA_GPU_THREAD_COUNT / HLTCA_GPU_WARP_SIZE + 1];
+       GPUshared() int startRows[HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR / HLTCA_GPU_WARP_SIZE + 1];
+       GPUshared() int endRows[HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR / HLTCA_GPU_WARP_SIZE + 1];
 #endif
        sMem.fNTracklets = *tracker->NTracklets();
 
-       for (int i = threadIdx.x;i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x)
+       for (int i = get_local_id(0);i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += get_local_size(0))
        {
                reinterpret_cast<int*>(&sMem.fRows)[i] = reinterpret_cast<int*>(tracker->SliceDataRows())[i];
        }
 
-       for (int iTracklet = threadIdx.x;iTracklet < (*tracker->NTracklets() / HLTCA_GPU_THREAD_COUNT + 1) * HLTCA_GPU_THREAD_COUNT;iTracklet += blockDim.x)
+       for (int iTracklet = get_local_id(0);iTracklet < (*tracker->NTracklets() / HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR + 1) * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;iTracklet += get_local_size(0))
        {
                AliHLTTPCCATrackParam tParam;
                AliHLTTPCCAThreadMemory rMem;
@@ -826,22 +827,22 @@ GPUd() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPUPP(
                        rMem.fGo = 1;
                }
 
-               if (threadIdx.x % HLTCA_GPU_WARP_SIZE == 0)
+               if (get_local_id(0) % HLTCA_GPU_WARP_SIZE == 0)
                {
-                       startRows[threadIdx.x / HLTCA_GPU_WARP_SIZE] = 160;
-                       endRows[threadIdx.x / HLTCA_GPU_WARP_SIZE] = 0;
+                       startRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE] = 160;
+                       endRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE] = 0;
                }
-               __syncthreads();
+               GPUsync();
                if (iTracklet < *tracker->NTracklets())
                {
-                       CAMath::AtomicMin(&startRows[threadIdx.x / HLTCA_GPU_WARP_SIZE], rMem.fStartRow);
+                       CAMath::AtomicMinShared(&startRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE], rMem.fStartRow);
                }
-               __syncthreads();
+               GPUsync();
                if (iTracklet < *tracker->NTracklets())
                {
-                       for (int j = startRows[threadIdx.x / HLTCA_GPU_WARP_SIZE];j < HLTCA_ROW_COUNT;j++)
+                       for (int j = startRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE];j < HLTCA_ROW_COUNT;j++)
                        {
-                               UpdateTracklet(gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, *tracker, tParam, j);
+                               UpdateTracklet(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, *tracker, tParam, j);
                                if (!rMem.fGo) break;
                        }
 
@@ -851,18 +852,20 @@ GPUd() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPUPP(
                        {
                                if ( !tParam.TransportToX( tracker->Row( rMem.fEndRow ).X(), tracker->Param().ConstBz(), .999 ) )  rMem.fGo = 0;
                        }
-                       CAMath::AtomicMax(&endRows[threadIdx.x / HLTCA_GPU_WARP_SIZE], rMem.fEndRow);
+                       CAMath::AtomicMaxShared(&endRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE], rMem.fEndRow);
                }
 
-               __syncthreads();
+               GPUsync();
                if (iTracklet < *tracker->NTracklets())
                {
                        for (int j = rMem.fEndRow;j >= 0;j--)
                        {
                                if (!rMem.fGo) break;
-                               UpdateTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, *tracker, tParam, j);
+                               UpdateTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, *tracker, tParam, j);
                        }
-                       StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, *tracker, tParam );
+                       StoreTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, *tracker, tParam );
                }
        }
-}
\ No newline at end of file
+}
+
+#endif
index 447942a..3d21e02 100755 (executable)
@@ -1,36 +1,49 @@
-all:                                                                   libAliHLTTPCCAGPU.so
+all:                                                                   libAliHLTTPCCAGPU.so libAliHLTTPCCAGPUOpenCL.so
 
 clean:
-                                                                               rm -f libAliHLTTPCCAGPU.so AliHLTTPCCAGPUTrackerNVCC.o G__AliHLTTPCCAGPU.o AliHLTTPCCAGPUTrackerNVCC.cu.tmp.cxx AliHLTTPCCAGPUTrackerNVCC.cu.cxx G__AliHLTTPCCAGPUAutoLinkDef.h G__AliHLTTPCCAGPU.h G__AliHLTTPCCAGPU.cxx
+                                                                               rm -f libAliHLTTPCCAGPU*.so AliHLTTPCCAGPUTracker*.o G__AliHLTTPCCAGPU*.o AliHLTTPCCAGPUTrackerNVCC.cu.cxx G__AliHLTTPCCAGPUAutoLinkDef*.h G__AliHLTTPCCAGPU*.h G__AliHLTTPCCAGPU*.cxx makefiles/opencl_compiler AliHLTTPCCAGPUTrackerOpenCLCode.*
 
+# CUDA tracker library. The objects ($^) are listed before the -l options so that
+# linkers resolving left to right (static archives, --as-needed) keep the libraries.
+libAliHLTTPCCAGPU.so:                                  AliHLTTPCCAGPUTrackerNVCC.o AliHLTTPCCAGPUTrackerBase.o G__AliHLTTPCCAGPUNVCC.o
+                                                                               c++ -shared -o $@ $^ -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L${ALICE_ROOT}/lib/tgt_${ALICE_TARGET} -L. -lcuda -lcudart -lAliHLTTPC
 
-libAliHLTTPCCAGPU.so:                                  AliHLTTPCCAGPUTrackerNVCC.o G__AliHLTTPCCAGPU.o
-                                                                               c++ -shared AliHLTTPCCAGPUTrackerNVCC.o G__AliHLTTPCCAGPU.o -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L${ALICE_ROOT}/lib/tgt_${ALICE_TARGET} -L. -lcuda -lcudart -lAliHLTTPC -o libAliHLTTPCCAGPU.so 
-
+# OpenCL tracker library. The objects ($^) are listed before the -l options so that
+# linkers resolving left to right (static archives, --as-needed) keep the libraries.
+libAliHLTTPCCAGPUOpenCL.so:                            AliHLTTPCCAGPUTrackerOpenCL.o AliHLTTPCCAGPUTrackerBase.o G__AliHLTTPCCAGPUOpenCL.o AliHLTTPCCAGPUTrackerOpenCLCode.o
+                                                                               c++ -shared -o $@ $^ -L$(AMDAPPSDKROOT)/lib/x86_64 -L${ALICE_ROOT}/lib/tgt_${ALICE_TARGET} -L. -lOpenCL -lAliHLTTPC
 
 AliHLTTPCCAGPUTrackerNVCC.o:                   AliHLTTPCCAGPUTrackerNVCC.cu.cxx
-                                                                               c++ -fPIC -DPACKAGE_TARNAME=\"alice-hlt\" -DPACKAGE_VERSION=\"35631\" -DPACKAGE_BUGREPORT=\"Matthias.Richter@ift.uib.no\" -DPACKAGE=\"alice-hlt\" -DVERSION=\"35631\" -DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_DLFCN_H=1 -DLT_OBJDIR=\".libs/\" -DNDEBUG=1 -Duse_aliroot=1 -Duse_root=1 -DHAVE_HOMERREADER=1 -DHLT_SAMPLE=1 -DHLT_UTIL=1 -DHAVE_ALITPCRAWSTREAM_H=1 -DHLT_TPC=1 -DHAVE_NOT_TPCOFFLINE_REC=1 -DHAVE_TPC_MAPPING=1 -DHAVE_ALIALTRODECODER_H=1 -DHLT_RCU=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_CALO=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_PHOS=1 -DHLT_EMCAL=1 -DHLT_TRD=1 -DHLT_FMD=1 -DHAVE_ALIMPEXMAP_H=1 -DHAVE_ALIMUONTRIGGERIO_H=1 -DHLT_MUON=1 -DHLT_TRIGGER=1 -DHLT_GLOBAL=1 -DHLT_JET=1 -DHAVE_ALIITSCOMPRESSRAWDATASDD_H=1 -DHLT_ITS=1 -DHLT_COMP=1 -DMODULE=AliHLTTPC -W -Weffc++ -Wall -Wshadow -DROOTVERSION=\"5.25/02\" -DALIROOTVERSION=\"Unknown\" -O2 -DBUILD_GPU -c AliHLTTPCCAGPUTrackerNVCC.cu.cxx -o AliHLTTPCCAGPUTrackerNVCC.o
+                                                                               c++ -fPIC -DPACKAGE_TARNAME=\"alice-hlt\" -DPACKAGE_VERSION=\"35631\" -DPACKAGE_BUGREPORT=\"Matthias.Richter@ift.uib.no\" -DPACKAGE=\"alice-hlt\" -DVERSION=\"35631\" -DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_DLFCN_H=1 -DLT_OBJDIR=\".libs/\" -DNDEBUG=1 -Duse_aliroot=1 -Duse_root=1 -DHAVE_HOMERREADER=1 -DHLT_SAMPLE=1 -DHLT_UTIL=1 -DHAVE_ALITPCRAWSTREAM_H=1 -DHLT_TPC=1 -DHAVE_NOT_TPCOFFLINE_REC=1 -DHAVE_TPC_MAPPING=1 -DHAVE_ALIALTRODECODER_H=1 -DHLT_RCU=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_CALO=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_PHOS=1 -DHLT_EMCAL=1 -DHLT_TRD=1 -DHLT_FMD=1 -DHAVE_ALIMPEXMAP_H=1 -DHAVE_ALIMUONTRIGGERIO_H=1 -DHLT_MUON=1 -DHLT_TRIGGER=1 -DHLT_GLOBAL=1 -DHLT_JET=1 -DHAVE_ALIITSCOMPRESSRAWDATASDD_H=1 -DHLT_ITS=1 -DHLT_COMP=1 -DMODULE=AliHLTTPC -W -Wall -Wshadow -Wno-effc++ -DROOTVERSION=\"5.25/02\" -DALIROOTVERSION=\"Unknown\" -O2 -DBUILD_GPU -c AliHLTTPCCAGPUTrackerNVCC.cu.cxx -o AliHLTTPCCAGPUTrackerNVCC.o
+
+AliHLTTPCCAGPUTrackerBase.o:                   AliHLTTPCCAGPUTrackerBase.cxx
+                                                                               c++ -fPIC -DPACKAGE_TARNAME=\"alice-hlt\" -DPACKAGE_VERSION=\"35631\" -DPACKAGE_BUGREPORT=\"Matthias.Richter@ift.uib.no\" -DPACKAGE=\"alice-hlt\" -DVERSION=\"35631\" -DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_DLFCN_H=1 -DLT_OBJDIR=\".libs/\" -DNDEBUG=1 -Duse_aliroot=1 -Duse_root=1 -DHAVE_HOMERREADER=1 -DHLT_SAMPLE=1 -DHLT_UTIL=1 -DHAVE_ALITPCRAWSTREAM_H=1 -DHLT_TPC=1 -DHAVE_NOT_TPCOFFLINE_REC=1 -DHAVE_TPC_MAPPING=1 -DHAVE_ALIALTRODECODER_H=1 -DHLT_RCU=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_CALO=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_PHOS=1 -DHLT_EMCAL=1 -DHLT_TRD=1 -DHLT_FMD=1 -DHAVE_ALIMPEXMAP_H=1 -DHAVE_ALIMUONTRIGGERIO_H=1 -DHLT_MUON=1 -DHLT_TRIGGER=1 -DHLT_GLOBAL=1 -DHLT_JET=1 -DHAVE_ALIITSCOMPRESSRAWDATASDD_H=1 -DHLT_ITS=1 -DHLT_COMP=1 -DMODULE=AliHLTTPC -W -Weffc++ -Wall -Wshadow -DROOTVERSION=\"5.25/02\" -DALIROOTVERSION=\"Unknown\" -O2 -DBUILD_GPU -I${ALICE_ROOT}/HLT/BASE -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca -I${ROOTSYS}/include -c $< -o $@
+
+AliHLTTPCCAGPUTrackerOpenCL.o:                 AliHLTTPCCAGPUTrackerOpenCL.cxx
+                                                                               c++ -fPIC -DPACKAGE_TARNAME=\"alice-hlt\" -DPACKAGE_VERSION=\"35631\" -DPACKAGE_BUGREPORT=\"Matthias.Richter@ift.uib.no\" -DPACKAGE=\"alice-hlt\" -DVERSION=\"35631\" -DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_DLFCN_H=1 -DLT_OBJDIR=\".libs/\" -DNDEBUG=1 -Duse_aliroot=1 -Duse_root=1 -DHAVE_HOMERREADER=1 -DHLT_SAMPLE=1 -DHLT_UTIL=1 -DHAVE_ALITPCRAWSTREAM_H=1 -DHLT_TPC=1 -DHAVE_NOT_TPCOFFLINE_REC=1 -DHAVE_TPC_MAPPING=1 -DHAVE_ALIALTRODECODER_H=1 -DHLT_RCU=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_CALO=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_PHOS=1 -DHLT_EMCAL=1 -DHLT_TRD=1 -DHLT_FMD=1 -DHAVE_ALIMPEXMAP_H=1 -DHAVE_ALIMUONTRIGGERIO_H=1 -DHLT_MUON=1 -DHLT_TRIGGER=1 -DHLT_GLOBAL=1 -DHLT_JET=1 -DHAVE_ALIITSCOMPRESSRAWDATASDD_H=1 -DHLT_ITS=1 -DHLT_COMP=1 -DMODULE=AliHLTTPC -W -Weffc++ -Wall -Wshadow -DROOTVERSION=\"5.25/02\" -DALIROOTVERSION=\"Unknown\" -O2 -DBUILD_GPU -I$(AMDAPPSDKROOT)/include -I${ALICE_ROOT}/HLT/BASE -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca -I${ROOTSYS}/include -Imakefiles -Wno-write-strings -c $< -o $@
+
+
+G__AliHLTTPCCAGPU%.cxx:                                        AliHLTTPCCAGPUTracker%.h G__AliHLTTPCCAGPUAutoLinkDef%.h
+                                                                               rootcint -f $@ -c -Duse_aliroot -Duse_root -DROWHOUGHPARAMS -Duse_reconstruction -Duse_newio -DROOTVERSION=\"unchecked\" -DALIROOTVERSION=\"unchecked\" -D__ROOT__ -DUSE_ALILOG -DLINUX -DNDEBUG -D_MODULE_=\"HLT\" -D`uname` -DDATE_SYS=`uname` -Dlong32='int' -Dlong64='long long' -DdatePointer='long' -I${ROOTSYS}/include -pthread -m64 -DWITHXML -DWITHXML -DUSE_ROOT -DWITHXML -I${ALICE_ROOT}/HLT/BASE -I${ALICE_ROOT}/HLT/BASE/util -I${ALICE_ROOT}/HLT -I${ALICE_ROOT}/HLT/TPCLib -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca $^
 
-G__AliHLTTPCCAGPU.cxx:                                 G__AliHLTTPCCAGPUAtoLinkDef.h
-                                                                               rootcint -f G__AliHLTTPCCAGPU.cxx -c -Duse_aliroot -Duse_root -DROWHOUGHPARAMS -Duse_reconstruction -Duse_newio -DROOTVERSION=\"unchecked\" -DALIROOTVERSION=\"unchecked\" -D__ROOT__ -DUSE_ALILOG -DLINUX -DNDEBUG -D_MODULE_=\"HLT\" -D`uname` -DDATE_SYS=`uname` -Dlong32='int' -Dlong64='long long' -DdatePointer='long' -I${ROOTSYS}/include -pthread -m64 -DWITHXML -DWITHXML -DUSE_ROOT -DWITHXML -I${ALICE_ROOT}/HLT/BASE -I${ALICE_ROOT}/HLT/BASE/util -I${ALICE_ROOT}/HLT -I${ALICE_ROOT}/HLT/TPCLib -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca AliHLTTPCCAGPUTrackerNVCC.h G__AliHLTTPCCAGPUAutoLinkDef.h
+G__AliHLTTPCCAGPUAutoLinkDef%.h:               AliHLTTPCCAGPUTracker%.h
+                                                                               echo '//automatically generated ROOT DICT definition' > $@
+                                                                               echo '//!!! DO NOT EDIT THIS FILE !!!' >> $@
+                                                                               echo '#ifdef __CINT__' >> $@
+                                                                               echo '#pragma link off all globals;' >> $@
+                                                                               echo '#pragma link off all classes;' >> $@
+                                                                               echo '#pragma link off all functions;' >> $@
+                                                                               echo "#pragma link C++ class $<+;" | sed "s/\.h//" >> $@
+                                                                               echo "#pragma link C++ class AliHLTTPCCAGPUTrackerBase+;" >> $@
+                                                                               echo '#endif' >> $@
 
-G__AliHLTTPCCAGPUAtoLinkDef.h:                 AliHLTTPCCAGPUTrackerNVCC.h AliHLTTPCCAGPUTrackerNVCC.cu
-                                                                               echo '//automatically generated ROOT DICT definition' > G__AliHLTTPCCAGPUAutoLinkDef.h
-                                                                               echo '//!!! DO NOT EDIT THIS FILE !!!' >> G__AliHLTTPCCAGPUAutoLinkDef.h
-                                                                               echo '#ifdef __CINT__' >> G__AliHLTTPCCAGPUAutoLinkDef.h
-                                                                               echo '#pragma link off all globals;' >> G__AliHLTTPCCAGPUAutoLinkDef.h
-                                                                               echo '#pragma link off all classes;' >> G__AliHLTTPCCAGPUAutoLinkDef.h
-                                                                               echo '#pragma link off all functions;' >> G__AliHLTTPCCAGPUAutoLinkDef.h
-                                                                               echo "#pragma link C++ class AliHLTTPCCAGPUTrackerNVCC+;" >> G__AliHLTTPCCAGPUAutoLinkDef.h
-                                                                               echo '#endif' >> G__AliHLTTPCCAGPUAutoLinkDef.h
+G__AliHLTTPCCAGPU%.o:                                  G__AliHLTTPCCAGPU%.cxx
+                                                                               g++ -DcudaError_t=int -Duse_aliroot -Duse_root -DROWHOUGHPARAMS -Duse_reconstruction -Duse_newio -DROOTVERSION=\"unchecked\" -DALIROOTVERSION=\"unchecked\" -D__ROOT__ -DUSE_ALILOG -DLINUX -DNDEBUG -DBUILD_GPU -D_MODULE_=\"HLT\" -I${ALICE_ROOT}/HLT/TPCLib -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca -I${ALICE_ROOT}/HLT/BASE -c $< -o $@ -O -g -W -Wall -Weffc++ -fPIC -pipe -fmessage-length=0 -Wno-long-long -ansi -Dlinux -D`uname` -DDATE_SYS=`uname` -Dlong32='int' -Dlong64='long long' -DdatePointer='long' -I${ROOTSYS}/include -pthread -m64 -D__PHOSUTIL__ -D__EMCALUTIL__
 
-G__AliHLTTPCCAGPU.o:                                   G__AliHLTTPCCAGPU.cxx
-                                                                               g++ -DcudaError_t=int -Duse_aliroot -Duse_root -DROWHOUGHPARAMS -Duse_reconstruction -Duse_newio -DROOTVERSION=\"unchecked\" -DALIROOTVERSION=\"unchecked\" -D__ROOT__ -DUSE_ALILOG -DLINUX -DNDEBUG -DBUILD_GPU -D_MODULE_=\"HLT\" -I${ALICE_ROOT}/HLT/TPCLib -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca -I${ALICE_ROOT}/HLT/BASE -c G__AliHLTTPCCAGPU.cxx -o G__AliHLTTPCCAGPU.o -O -g -W -Wall -Weffc++ -fPIC -pipe -fmessage-length=0 -Wno-long-long -ansi -Dlinux -D`uname` -DDATE_SYS=`uname` -Dlong32='int' -Dlong64='long long' -DdatePointer='long' -I${ROOTSYS}/include -pthread -m64 -D__PHOSUTIL__ -D__EMCALUTIL__
+AliHLTTPCCAGPUTrackerNVCC.cu.cxx:              AliHLTTPCCAGPUTrackerNVCC.cu
+                                                                               nvcc --cuda --use_fast_math --maxrregcount 64 -O4 -Xptxas -v -Xptxas -O4 -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 --compiler-options "-DPACKAGE_TARNAME=\"alice-hlt\" -DPACKAGE_VERSION=\"35631\" -DPACKAGE_BUGREPORT=\"Matthias.Richter@ift.uib.no\" -DPACKAGE=\"alice-hlt\" -DVERSION=\"35631\" -DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_DLFCN_H=1 -DLT_OBJDIR=\".libs/\" -DNDEBUG=1 -Duse_aliroot=1 -Duse_root=1 -DHAVE_HOMERREADER=1 -DHLT_SAMPLE=1 -DHLT_UTIL=1 -DHAVE_ALITPCRAWSTREAM_H=1 -DHLT_TPC=1 -DHAVE_NOT_TPCOFFLINE_REC=1 -DHAVE_TPC_MAPPING=1 -DHAVE_ALIALTRODECODER_H=1 -DHLT_RCU=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_CALO=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_PHOS=1 -DHLT_EMCAL=1 -DHLT_TRD=1 -DHLT_FMD=1 -DHAVE_ALIMPEXMAP_H=1 -DHAVE_ALIMUONTRIGGERIO_H=1 -DHLT_MUON=1 -DHLT_TRIGGER=1 -DHLT_GLOBAL=1 -DHLT_JET=1 -DHAVE_ALIITSCOMPRESSRAWDATASDD_H=1 -DHLT_ITS=1 -DHLT_COMP=1 -DMODULE=AliHLTTPC -IRCU -W -Wall -Wshadow -DROOTVERSION=\"5.25/02\" -DALIROOTVERSION=\"Unknown\" -O2 -DBUILD_GPU -I${ALICE_ROOT}/HLT/BASE -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca -I${ROOTSYS}/include" -I. $< --output-file $@
 
-AliHLTTPCCAGPUTrackerNVCC.cu.cxx:              AliHLTTPCCAGPUTrackerNVCC.cu.tmp.cxx
-                                                                               cat AliHLTTPCCAGPUTrackerNVCC.cu.tmp.cxx | grep -v "^#" > AliHLTTPCCAGPUTrackerNVCC.cu.cxx
-                                                                               -patch -r /dev/null -s --no-backup-if-mismatch -i AliHLTTPCCAGPUTrackerNVCC.cu.x86_64-pc-linux-gnu.patch AliHLTTPCCAGPUTrackerNVCC.cu.cxx
+# Wraps the compiled OpenCL binary into an object file via makefiles/include.S (.incbin).
+# The assembler stub is a prerequisite too, so editing it triggers a rebuild.
+AliHLTTPCCAGPUTrackerOpenCLCode.o:             AliHLTTPCCAGPUTrackerOpenCLCode.bin makefiles/include.S
+                                                                               gcc -c makefiles/include.S -o $@
 
-AliHLTTPCCAGPUTrackerNVCC.cu.tmp.cxx:  AliHLTTPCCAGPUTrackerNVCC.cu
-                                                                               nvcc --cuda --use_fast_math --maxrregcount 64 -O4 -Xptxas -v -Xptxas -O4 -gencode arch=compute_20,code=sm_20 --compiler-options "-DPACKAGE_TARNAME=\"alice-hlt\" -DPACKAGE_VERSION=\"35631\" -DPACKAGE_BUGREPORT=\"Matthias.Richter@ift.uib.no\" -DPACKAGE=\"alice-hlt\" -DVERSION=\"35631\" -DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_DLFCN_H=1 -DLT_OBJDIR=\".libs/\" -DNDEBUG=1 -Duse_aliroot=1 -Duse_root=1 -DHAVE_HOMERREADER=1 -DHLT_SAMPLE=1 -DHLT_UTIL=1 -DHAVE_ALITPCRAWSTREAM_H=1 -DHLT_TPC=1 -DHAVE_NOT_TPCOFFLINE_REC=1 -DHAVE_TPC_MAPPING=1 -DHAVE_ALIALTRODECODER_H=1 -DHLT_RCU=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_CALO=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_PHOS=1 -DHLT_EMCAL=1 -DHLT_TRD=1 -DHLT_FMD=1 -DHAVE_ALIMPEXMAP_H=1 -DHAVE_ALIMUONTRIGGERIO_H=1 -DHLT_MUON=1 -DHLT_TRIGGER=1 -DHLT_GLOBAL=1 -DHLT_JET=1 -DHAVE_ALIITSCOMPRESSRAWDATASDD_H=1 -DHLT_ITS=1 -DHLT_COMP=1 -DMODULE=AliHLTTPC -IRCU -W -Weffc++ -Wall -Wshadow -DROOTVERSION=\"5.25/02\" -DALIROOTVERSION=\"Unknown\" -O2 -DBUILD_GPU -I${ALICE_ROOT}/HLT/BASE -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca -I${ROOTSYS}/include" -I. AliHLTTPCCAGPUTrackerNVCC.cu --output-file AliHLTTPCCAGPUTrackerNVCC.cu.tmp.cxx
+# Compiles the OpenCL tracker source into a binary container using the helper tool
+# makefiles/opencl_compiler. Everything after "--" is handed to the OpenCL compiler
+# as build options.
+AliHLTTPCCAGPUTrackerOpenCLCode.bin:   AliHLTTPCCAGPUTrackerOpenCL.cl makefiles/opencl_compiler
+                                                                               makefiles/opencl_compiler -output-file $@ AliHLTTPCCAGPUTrackerOpenCL.cl -- -I. -I${ALICE_ROOT}/HLT/BASE -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca -I${ROOTSYS}/include -x clc++
 
+# Builds the host-side helper that compiles OpenCL sources at build time.
+# Headers and libOpenCL are taken from the SDK located at $(AMDAPPSDKROOT).
+makefiles/opencl_compiler:                             makefiles/makefile_opencl_compiler.cpp
+                                                                               c++ $< -o $@ -I$(AMDAPPSDKROOT)/include -L$(AMDAPPSDKROOT)/lib/x86_64 -lOpenCL
diff --git a/HLT/TPCLib/tracking-ca/cagpu/makefiles/include.S b/HLT/TPCLib/tracking-ca/cagpu/makefiles/include.S
new file mode 100644 (file)
index 0000000..5b4e029
--- /dev/null
@@ -0,0 +1,8 @@
+    /* Embeds AliHLTTPCCAGPUTrackerOpenCLCode.bin into read-only data and exports two   */
+    /* symbols: the start of the embedded bytes and a 32-bit value holding their length. */
+    .global _makefile_opencl_program_cagpubuild_AliHLTTPCCAGPUTrackerOpenCL_cl
+    .global _makefile_opencl_program_cagpubuild_AliHLTTPCCAGPUTrackerOpenCL_cl_size
+    .section .rodata
+_makefile_opencl_program_cagpubuild_AliHLTTPCCAGPUTrackerOpenCL_cl:
+    .incbin "AliHLTTPCCAGPUTrackerOpenCLCode.bin"
+1:
+    /* Local label 1 marks the end of the blob; the size is end minus start. */
+_makefile_opencl_program_cagpubuild_AliHLTTPCCAGPUTrackerOpenCL_cl_size:
+    .int 1b - _makefile_opencl_program_cagpubuild_AliHLTTPCCAGPUTrackerOpenCL_cl
+    /* Hand-written assembly carries no stack note by default; without it GNU ld may */
+    /* mark the resulting shared library as requiring an executable stack.           */
+    .section .note.GNU-stack,"",@progbits
diff --git a/HLT/TPCLib/tracking-ca/cagpu/makefiles/makefile_opencl_compiler.cpp b/HLT/TPCLib/tracking-ca/cagpu/makefiles/makefile_opencl_compiler.cpp
new file mode 100644 (file)
index 0000000..fa2104e
--- /dev/null
@@ -0,0 +1,232 @@
+#define _CRT_SECURE_NO_WARNINGS
+#include "CL/opencl.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <string>
+#include <vector>
+
+#include "opencl_compiler_structs.h"
+
+#define quit(arg) {fprintf(stderr, arg "\n");return(1);}
+#define DEFAULT_OPENCL_COMPILER_OPTIONS ""
+#define DEFAULT_OUTPUT_FILE "opencl.out"
+
+int main(int argc, char** argv)
+{
+       const char* output_file = DEFAULT_OUTPUT_FILE;
+       std::string compiler_options = DEFAULT_OPENCL_COMPILER_OPTIONS;
+       std::vector<char*> files;
+
+       printf("Passing command line options:\n");
+       bool add_option = false;
+       for (int i = 1;i < argc;i++)
+       {
+               if (add_option)
+               {
+                       compiler_options += " ";
+                       compiler_options += argv[i];
+               }
+               else if (strcmp(argv[i], "--") == 0)
+               {
+                       add_option = true;
+               }
+               else if (strcmp(argv[i], "-output-file") == 0)
+               {
+                       if (++i >= argc) quit("Output file name missing");
+                       output_file = argv[i];
+               }
+               else
+               {
+                       fprintf(stderr, "%s\n", argv[i]);
+                       files.push_back(argv[i]);
+               }
+       }
+       
+       cl_int ocl_error;
+       cl_uint num_platforms;
+       if (clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS) quit("Error getting OpenCL Platform Count");
+       if (num_platforms == 0) quit("No OpenCL Platform found");
+       printf("%d OpenCL Platforms found\n", num_platforms);
+       
+       //Query platforms
+       cl_platform_id* platforms = new cl_platform_id[num_platforms];
+       if (platforms == NULL) quit("Memory allocation error");
+       if (clGetPlatformIDs(num_platforms, platforms, NULL) != CL_SUCCESS) quit("Error getting OpenCL Platforms");
+
+       cl_platform_id platform;
+       bool found = false;
+
+       _makefiles_opencl_platform_info pinfo;
+       for (unsigned int i_platform = 0;i_platform < num_platforms;i_platform++)
+       {
+               clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_PROFILE, 64, pinfo.platform_profile, NULL);
+               clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_VERSION, 64, pinfo.platform_version, NULL);
+               clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_NAME, 64, pinfo.platform_name, NULL);
+               clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_VENDOR, 64, pinfo.platform_vendor, NULL);
+               printf("Available Platform %d: (%s %s) %s %s\n", i_platform, pinfo.platform_profile, pinfo.platform_version, pinfo.platform_vendor, pinfo.platform_name);
+               if (strcmp(pinfo.platform_vendor, "Advanced Micro Devices, Inc.") == 0)
+               {
+                       found = true;
+                       printf("AMD OpenCL Platform found\n");
+                       platform = platforms[i_platform];
+                       break;
+               }
+       }
+       if (found == false)
+       {
+               quit("Did not find AMD OpenCL Platform");
+       }
+
+       if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &pinfo.count) != CL_SUCCESS)
+       {
+               quit("Error getting OPENCL Device Count");
+       }
+
+       //Query devices
+       cl_device_id* devices = new cl_device_id[pinfo.count];
+       if (devices == NULL) quit("Memory allocation error");
+       if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, pinfo.count, devices, NULL) != CL_SUCCESS) quit("Error getting OpenCL devices"); 
+
+       _makefiles_opencl_device_info dinfo;
+       cl_device_type device_type;
+       cl_uint freq, shaders;
+
+       printf("Available OPENCL devices:\n");
+       for (unsigned int i = 0;i < pinfo.count;i++)
+       {
+               printf("Examining device %d\n", i);
+
+               clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 64, dinfo.device_name, NULL);
+               clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, 64, dinfo.device_vendor, NULL);
+               clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL);
+               clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(freq), &freq, NULL);
+               clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(shaders), &shaders, NULL);
+               clGetDeviceInfo(devices[i], CL_DEVICE_ADDRESS_BITS, sizeof(dinfo.nbits), &dinfo.nbits, NULL);
+               printf("Found Device %d: %s %s (Frequency %d, Shaders %d, %d bit)\n", i, dinfo.device_vendor, dinfo.device_name, (int) freq, (int) shaders, (int) dinfo.nbits);
+       }
+
+       if (files.size() == 0)
+       {
+               quit("Syntax: opencl [-output-file OUTPUT_FILE] FILE1 [FILE2] ... [FILEn] [-- COMPILER_OPTION_1] [COMPILER_OPTION_2] ... [COMPILER_OPTION_N]");
+       }
+
+       char** buffers = (char**) malloc(files.size() * sizeof(char*));
+       if (buffers == NULL) quit("Memory allocation error\n");
+       for (unsigned int i = 0;i < files.size();i++)
+       {
+               printf("Reading source file %s\n", files[i]);
+               FILE* fp = fopen(files[i], "rb");
+               if (fp == NULL)
+               {
+                       printf("Cannot open %s\n", files[i]);
+                       return(1);
+               }
+               fseek(fp, 0, SEEK_END);
+               size_t file_size = ftell(fp);
+               fseek(fp, 0, SEEK_SET);
+
+               buffers[i] = (char*) malloc(file_size + 1);
+               if (buffers[i] == NULL)
+               {
+                       quit("Memory allocation error");
+               }
+               if (fread(buffers[i], 1, file_size, fp) != file_size)
+               {
+                       quit("Error reading file");
+               }
+               buffers[i][file_size] = 0;
+               fclose(fp);
+       }
+
+       printf("Creating OpenCL Context\n");
+       //Create OpenCL context
+       cl_context context = clCreateContext(NULL, pinfo.count, devices, NULL, NULL, &ocl_error);
+       if (ocl_error != CL_SUCCESS) quit("Error creating OpenCL context");
+
+       printf("Creating OpenCL Program Object\n");
+       //Create OpenCL program object
+       cl_program program = clCreateProgramWithSource(context, (cl_uint) files.size(), (const char**) buffers, NULL, &ocl_error);
+       if (ocl_error != CL_SUCCESS) quit("Error creating program object");
+
+       printf("Compiling OpenCL Program\n");
+       //Compile program
+       ocl_error = clBuildProgram(program, pinfo.count, devices, compiler_options.c_str(), NULL, NULL);
+       if (ocl_error != CL_SUCCESS)
+       {
+               fprintf(stderr, "OpenCL Error while building program: %d (Compiler options: %s)\n", ocl_error, compiler_options.c_str());
+               fprintf(stderr, "OpenCL Kernel:\n\n");
+               for (unsigned int i = 0;i < files.size();i++)
+               {
+                       printf("%s\n\n", buffers[i]);
+               }
+               
+               for (unsigned int i = 0;i < pinfo.count;i++)
+               {
+                       cl_build_status status;
+                       clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_STATUS, sizeof(status), &status, NULL);
+                       if (status == CL_BUILD_ERROR)
+                       {
+                               size_t log_size;
+                               clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+                               char* build_log = (char*) malloc(log_size + 1);
+                               if (build_log == NULL) quit("Memory allocation error");
+                               clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
+                               fprintf(stderr, "Build Log (device %d):\n\n%s\n\n", i, build_log);
+                               free(build_log);
+                       }
+               }
+       }
+       for (unsigned int i = 0;i < files.size();i++)
+       {
+               free(buffers[i]);
+       }
+       free(buffers);
+       if (ocl_error != CL_SUCCESS) return(1);
+
+       printf("Obtaining program binaries\n");
+       size_t* binary_sizes = (size_t*) malloc(pinfo.count * sizeof(size_t));
+       if (binary_sizes == NULL) quit("Memory allocation error");
+       clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, pinfo.count * sizeof(size_t), binary_sizes, NULL);
+       char** binary_buffers = (char**) malloc(pinfo.count * sizeof(char*));
+       if (binary_buffers == NULL) quit("Memory allocation error");
+       for (unsigned int i = 0;i < pinfo.count;i++)
+       {
+               printf("Binary size for device %d: %d\n", i, (int) binary_sizes[i]);
+               binary_buffers[i] = (char*) malloc(binary_sizes[i]);
+               memset(binary_buffers[i], 0, binary_sizes[i]);
+               if (binary_buffers[i] == NULL) quit("Memory allocation error");
+       }
+       clGetProgramInfo(program, CL_PROGRAM_BINARIES, pinfo.count * sizeof(char*), binary_buffers, NULL);
+
+       printf("Programs obtained successfully, cleaning up opencl\n");
+       clReleaseProgram(program);
+       clReleaseContext(context);
+
+       printf("Writing binaries to file (%s)\n", output_file);
+       FILE* fp;
+       fp = fopen(output_file, "w+b");
+       if (fp == NULL) quit("Error opening output file\n");
+       const char* magic_bytes = "QOCLPB";
+       fwrite(magic_bytes, 1, strlen(magic_bytes) + 1, fp);
+       fwrite(&pinfo, 1, sizeof(pinfo), fp);
+       for (unsigned int i = 0;i < pinfo.count;i++)
+       {
+               clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 64, dinfo.device_name, NULL);
+               clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, 64, dinfo.device_vendor, NULL);
+               dinfo.binary_size = binary_sizes[i];
+               fwrite(&dinfo, 1, sizeof(dinfo), fp);
+               fwrite(binary_buffers[i], 1, binary_sizes[i], fp);
+       }
+       fclose(fp);
+
+       printf("All done, cleaning up remaining buffers\n");
+       for (unsigned int i = 0;i < pinfo.count;i++)
+       {
+               free(binary_buffers[i]);
+       }
+       free(binary_sizes);
+       free(binary_buffers);
+
+       return(0);
+}
\ No newline at end of file
diff --git a/HLT/TPCLib/tracking-ca/cagpu/makefiles/opencl_compiler_structs.h b/HLT/TPCLib/tracking-ca/cagpu/makefiles/opencl_compiler_structs.h
new file mode 100644 (file)
index 0000000..bba156e
--- /dev/null
@@ -0,0 +1,16 @@
+// Platform record of the compiled-program container. The build-time compiler writes
+// this struct to the file as raw bytes directly after the magic string, so its
+// in-memory layout is part of the file format.
+struct _makefiles_opencl_platform_info
+{
+       // Strings as returned by clGetPlatformInfo, limited to 64 bytes each
+       char platform_profile[64];
+       char platform_version[64];
+       char platform_name[64];
+       char platform_vendor[64];
+       // Number of devices of the platform, i.e. number of device records that follow
+       cl_uint count;
+};
+
+// Per-device record of the compiled-program container. Written to the file as raw
+// bytes and immediately followed by binary_size bytes of program binary.
+// NOTE: size_t makes the record layout depend on the ABI the writer was built for.
+struct _makefiles_opencl_device_info
+{
+       // Strings as returned by clGetDeviceInfo, limited to 64 bytes each
+       char device_name[64];
+       char device_vendor[64];
+       // Value of CL_DEVICE_ADDRESS_BITS
+       cl_uint nbits;
+       // Length in bytes of the program binary stored after this record
+       size_t binary_size;
+};
diff --git a/HLT/TPCLib/tracking-ca/cagpu/makefiles/opencl_obtain_program.h b/HLT/TPCLib/tracking-ca/cagpu/makefiles/opencl_obtain_program.h
new file mode 100644 (file)
index 0000000..4c03c68
--- /dev/null
@@ -0,0 +1,86 @@
+#ifndef MAKEFILES_OPENCL_OBTAIN_PROGRAMH
+#define MAKEFILES_OPENCL_OBTAIN_PROGRAMH
+
+#include <CL/opencl.h>
+#include <stdio.h>
+#include <string.h>
+#include <vector>
+#include "opencl_compiler_structs.h"
+
+static int _makefiles_opencl_obtain_program_helper(cl_context context, cl_uint num_devices, cl_device_id* devices, cl_program* program, char* binaries)
+{
+       const char* magic_bytes = "QOCLPB";
+       if (strncmp(magic_bytes, binaries, strlen(magic_bytes)) != 0)
+       {
+               printf("Internal error accessing opencl program\n");
+               return(1);
+       }
+       char* current_ptr = binaries + strlen(magic_bytes) + 1;
+       _makefiles_opencl_platform_info* pinfo = (_makefiles_opencl_platform_info*) current_ptr;
+       current_ptr += sizeof(_makefiles_opencl_platform_info);
+
+       if (num_devices != pinfo->count)
+       {
+               printf("Number of devices differs from number of devices in opencl program\n");
+               return(1);
+       }
+       //printf("Obtaining program for OpenCL Platform: (%s %s) %s %s\n", pinfo->platform_profile, pinfo->platform_version, pinfo->platform_vendor, pinfo->platform_name);
+
+       std::vector<size_t> program_sizes(pinfo->count);
+       std::vector<char*> program_binaries(pinfo->count);
+
+       for (unsigned int i = 0;i < pinfo->count;i++)
+       {
+               char device_name[64], device_vendor[64];
+               cl_uint nbits;
+               clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 64, device_name, NULL);
+               clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, 64, device_vendor, NULL);
+               clGetDeviceInfo(devices[i], CL_DEVICE_ADDRESS_BITS, sizeof(nbits), &nbits, NULL);
+               _makefiles_opencl_device_info* dinfo = (_makefiles_opencl_device_info*) current_ptr;
+               if (strcmp(device_name, dinfo->device_name) != 0 || strcmp(device_vendor, dinfo->device_vendor) != 0)
+               {
+                       printf("Device list is different to device list from opencl program\n");
+                       return(1);
+               }
+               if (nbits != dinfo->nbits)
+               {
+                       printf("Pointer size of device and stored device binary differs\n");
+                       return(1);
+               }
+               current_ptr += sizeof(_makefiles_opencl_device_info);
+               //printf("Device %d: %s %s (size %lld)\n", i, dinfo->device_vendor, dinfo->device_name, (long long int) dinfo->binary_size);
+               program_sizes[i] = dinfo->binary_size;
+               program_binaries[i] = current_ptr;
+               current_ptr += dinfo->binary_size;
+       }
+
+       std::vector<cl_int> return_status(pinfo->count);
+       cl_int ocl_error;
+       *program = clCreateProgramWithBinary(context, num_devices, devices, program_sizes.data(), (const unsigned char**) program_binaries.data(), return_status.data(), &ocl_error);
+
+       if (ocl_error != CL_SUCCESS)
+       {
+               printf("Error loading program\n");
+               return(1);
+       }
+
+       for (unsigned int i = 0;i < pinfo->count;i++)
+       {
+               if (return_status[i] != CL_SUCCESS)
+               {
+                       printf("Error loading program for device %d\n", i);
+                       clReleaseProgram(*program);
+                       return(1);
+               }
+       }
+
+       ocl_error = clBuildProgram(*program, num_devices, devices, "", NULL, NULL);
+       if (ocl_error != CL_SUCCESS)
+       {
+               printf("Error building program\n");
+               clReleaseProgram(*program);
+               return(1);
+       }
+
+       return(0);
+}
+
+#endif
\ No newline at end of file