Add external cagpu library for GPU tracker to AliRoot
authordrohr <drohr@f7af4fe6-9843-0410-8265-dc069ae4e863>
Thu, 28 Jun 2012 10:19:13 +0000 (10:19 +0000)
committerdrohr <drohr@f7af4fe6-9843-0410-8265-dc069ae4e863>
Thu, 28 Jun 2012 10:19:13 +0000 (10:19 +0000)
HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu [new file with mode: 0755]
HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu.x86_64-pc-linux-gnu.patch [new file with mode: 0755]
HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.h [new file with mode: 0755]
HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCATrackletConstructorGPU.h [new file with mode: 0755]
HLT/TPCLib/tracking-ca/cagpu/gpu_tracker_install.txt [new file with mode: 0755]
HLT/TPCLib/tracking-ca/cagpu/makefile [new file with mode: 0755]

diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu
new file mode 100755 (executable)
index 0000000..7b2874c
--- /dev/null
@@ -0,0 +1,2166 @@
+// **************************************************************************
+// This file is property of and copyright by the ALICE HLT Project          *
+// ALICE Experiment at CERN, All rights reserved.                           *
+//                                                                          *
+// Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
+//                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *
+//                                     David Rohr <drohr@kip.uni-heidelberg.de>                                *
+//                  for The ALICE HLT Project.                              *
+//                                                                          *
+// Permission to use, copy, modify and distribute this software and its     *
+// documentation strictly for non-commercial purposes is hereby granted     *
+// without fee, provided that the above copyright notice appears in all     *
+// copies and that both the copyright notice and this permission notice     *
+// appear in the supporting documentation. The authors make no claims       *
+// about the suitability of this software for any purpose. It is            *
+// provided "as is" without express or implied warranty.                    *
+//                                                                          *
+//***************************************************************************
+
+#define HLTCA_GPU_DEFAULT_MAX_SLICE_COUNT 36
+#define FERMI
+#include "AliHLTTPCCAGPUTrackerNVCC.h"
+
+#ifdef HLTCA_GPUCODE
+#include <cuda.h>
+#include <sm_11_atomic_functions.h>
+#include <sm_12_atomic_functions.h>
+#endif
+
+#ifdef R__WIN32
+#else
+#include <sys/syscall.h>
+#include <semaphore.h>
+#include <fcntl.h>
+#endif
+#include "AliHLTTPCCADef.h"
+#include "AliHLTTPCCAGPUConfig.h"
+
+#if defined(HLTCA_STANDALONE) & !defined(_WIN32)
+#include <sched.h>
+#endif
+
+#include <iostream>
+#include <fstream>
+
+//Disable assertions since they produce errors in GPU Code
+#ifdef assert
+#undef assert
+#endif
+#define assert(param)
+
+__constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];
+#ifdef HLTCA_GPU_TEXTURE_FETCH
+texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;
+texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;
+texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
+#endif
+
+//Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code
+#include "AliHLTTPCCATrackParam.cxx"
+#include "AliHLTTPCCATrack.cxx" 
+
+#include "AliHLTTPCCAHitArea.cxx"
+#include "AliHLTTPCCAGrid.cxx"
+#include "AliHLTTPCCARow.cxx"
+#include "AliHLTTPCCAParam.cxx"
+#include "AliHLTTPCCATracker.cxx"
+
+#include "AliHLTTPCCAProcess.h"
+
+#include "AliHLTTPCCATrackletSelector.cxx"
+#include "AliHLTTPCCANeighboursFinder.cxx"
+#include "AliHLTTPCCANeighboursCleaner.cxx"
+#include "AliHLTTPCCAStartHitsFinder.cxx"
+#include "AliHLTTPCCAStartHitsSorter.cxx"
+#include "AliHLTTPCCATrackletConstructor.cxx"
+
+#ifdef HLTCA_GPU_MERGER
+#include "AliHLTTPCGMMerger.h"
+#include "AliHLTTPCGMTrackParam.cxx"
+#endif
+
+#include "MemoryAssignmentHelpers.h"
+
+#ifndef HLTCA_STANDALONE
+#include "AliHLTDefinitions.h"
+#include "AliHLTSystem.h"
+#endif
+
+#define RANDOM_ERROR
+//#define RANDOM_ERROR || rand() % 500 == 1
+
+ClassImp( AliHLTTPCCAGPUTrackerNVCC )
+
+int AliHLTTPCCAGPUTrackerNVCC::GlobalTracking(int iSlice, int threadId, AliHLTTPCCAGPUTrackerNVCC::helperParam* hParam)
+{
+       if (fDebugLevel >= 3) printf("GPU Tracker running Global Tracking for slice %d on thread %d\n", iSlice, threadId);
+
+       int sliceLeft = (iSlice + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2);
+       int sliceRight = (iSlice + 1) % (fgkNSlices / 2);
+       if (iSlice >= fgkNSlices / 2)
+       {
+               sliceLeft += fgkNSlices / 2;
+               sliceRight += fgkNSlices / 2;
+       }
+       while (fSliceOutputReady < iSlice || fSliceOutputReady < sliceLeft || fSliceOutputReady < sliceRight)
+       {
+               if (hParam != NULL && hParam->fReset) return(1);
+       }
+
+       pthread_mutex_lock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceLeft]);
+       pthread_mutex_lock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceRight]);
+       fSlaveTrackers[iSlice].PerformGlobalTracking(fSlaveTrackers[sliceLeft], fSlaveTrackers[sliceRight], HLTCA_GPU_MAX_TRACKS);
+       pthread_mutex_unlock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceLeft]);
+       pthread_mutex_unlock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceRight]);
+
+       fSliceLeftGlobalReady[sliceLeft] = 1;
+       fSliceRightGlobalReady[sliceRight] = 1;
+       if (fDebugLevel >= 3) printf("GPU Tracker finished Global Tracking for slice %d on thread %d\n", iSlice, threadId);
+       return(0);
+}
+
+void* AliHLTTPCCAGPUTrackerNVCC::helperWrapper(void* arg)
+{
+       AliHLTTPCCAGPUTrackerNVCC::helperParam* par = (AliHLTTPCCAGPUTrackerNVCC::helperParam*) arg;
+       AliHLTTPCCAGPUTrackerNVCC* cls = par->fCls;
+
+       AliHLTTPCCATracker* tmpTracker = new AliHLTTPCCATracker;
+
+#ifdef HLTCA_STANDALONE
+       if (cls->fDebugLevel >= 2) HLTInfo("\tHelper thread %d starting", par->fNum);
+#endif
+
+#if defined(HLTCA_STANDALONE) & !defined(_WIN32)
+       cpu_set_t mask;
+       CPU_ZERO(&mask);
+       CPU_SET(par->fNum * 2 + 2, &mask);
+       //sched_setaffinity(0, sizeof(mask), &mask);
+#endif
+
+       while(pthread_mutex_lock(&((pthread_mutex_t*) par->fMutex)[0]) == 0 && par->fTerminate == false)
+       {
+               if (par->CPUTracker)
+               {
+                       for (int i = 0;i < cls->fNSlicesPerCPUTracker;i++)
+                       {
+                               int myISlice = cls->fSliceCount - cls->fNCPUTrackers * cls->fNSlicesPerCPUTracker + (par->fNum - cls->fNHelperThreads) * cls->fNSlicesPerCPUTracker + i;
+#ifdef HLTCA_STANDALONE
+                               if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Doing full CPU tracking, Slice %d", par->fNum, myISlice);
+#endif
+                               if (myISlice >= 0)
+                               {
+                                       tmpTracker->Initialize(cls->fSlaveTrackers[par->fFirstSlice + myISlice].Param());
+                                       tmpTracker->ReadEvent(&par->pClusterData[myISlice]);
+                                       tmpTracker->DoTracking();
+                                       tmpTracker->SetOutput(&par->pOutput[myISlice]);
+                                       pthread_mutex_lock((pthread_mutex_t*) cls->fHelperMemMutex);
+                                       tmpTracker->WriteOutputPrepare();
+                                       pthread_mutex_unlock((pthread_mutex_t*) cls->fHelperMemMutex);
+                                       tmpTracker->WriteOutput();
+
+                                       /*cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUSliceDataMemory((char*) new uint4[HLTCA_GPU_SLICE_DATA_MEMORY/sizeof(uint4)], (char*) new uint4[HLTCA_GPU_ROWS_MEMORY/sizeof(uint4)]);
+                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].ReadEvent(&par->pClusterData[myISlice]);
+                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersTracklets(HLTCA_GPU_MAX_TRACKLETS);
+                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersHits(par->pClusterData[myISlice].NumberOfClusters());
+                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersTracks(HLTCA_GPU_MAX_TRACKS, par->pClusterData[myISlice].NumberOfClusters());
+                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS, cls->fConstructorBlockCount);
+                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].HitMemorySize()/sizeof( uint4 ) + 100]), par->pClusterData[myISlice].NumberOfClusters());
+                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerTracksMemory(reinterpret_cast<char*> ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackMemorySize()/sizeof( uint4 ) + 100]), HLTCA_GPU_MAX_TRACKS, par->pClusterData[myISlice].NumberOfClusters());
+                                       cls->fSlaveTrackers[par->fFirstSlice + myISlice].DoTracking();
+                                       cls->WriteOutput(par->pOutput, par->fFirstSlice, myISlice, par->fNum + 1);
+                                       delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].HitMemory();
+                                       delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackletMemory();
+                                       delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackMemory();*/
+                               }
+#ifdef HLTCA_STANDALONE
+                               if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Finished, Slice %d", par->fNum, myISlice);
+#endif
+                       }
+               }
+               else
+               {
+                       int mustRunSlice19 = 0;
+                       for (int i = par->fNum + 1;i < par->fSliceCount;i += cls->fNHelperThreads + 1)
+                       {
+                               //if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Running, Slice %d+%d, Phase %d", par->fNum, par->fFirstSlice, i, par->fPhase);
+                               if (par->fPhase)
+                               {
+                                       if (cls->fUseGlobalTracking)
+                                       {
+                                               int realSlice = i + 1;
+                                               if (realSlice % (fgkNSlices / 2) < 1) realSlice -= fgkNSlices / 2;
+
+                                               if (realSlice % (fgkNSlices / 2) != 1)
+                                               {
+                                                       cls->GlobalTracking(realSlice, par->fNum + 1, par);
+                                               }
+
+                                               if (realSlice == 19)
+                                               {
+                                                       mustRunSlice19 = 1;
+                                               }
+                                               else
+                                               {
+                                                       while (cls->fSliceLeftGlobalReady[realSlice] == 0 || cls->fSliceRightGlobalReady[realSlice] == 0)
+                                                       {
+                                                               if (par->fReset) goto ResetHelperThread;
+                                                       }
+                                                       cls->WriteOutput(par->pOutput, par->fFirstSlice, realSlice, par->fNum + 1);
+                                               }
+                                       }
+                                       else
+                                       {
+                                               while (cls->fSliceOutputReady < i)
+                                               {
+                                                       if (par->fReset) goto ResetHelperThread;
+                                               }
+                                               cls->WriteOutput(par->pOutput, par->fFirstSlice, i, par->fNum + 1);
+                                       }
+                               }
+                               else
+                               {
+                                       cls->ReadEvent(par->pClusterData, par->fFirstSlice, i, par->fNum + 1);
+                                       par->fDone = i + 1;
+                               }
+                               //if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Finished, Slice %d+%d, Phase %d", par->fNum, par->fFirstSlice, i, par->fPhase);
+                       }
+                       if (mustRunSlice19)
+                       {
+                               while (cls->fSliceLeftGlobalReady[19] == 0 || cls->fSliceRightGlobalReady[19] == 0)
+                               {
+                                       if (par->fReset) goto ResetHelperThread;
+                               }
+                               cls->WriteOutput(par->pOutput, par->fFirstSlice, 19, par->fNum + 1);
+                       }
+               }
+ResetHelperThread:
+               cls->ResetThisHelperThread(par);
+       }
+#ifdef HLTCA_STANDALONE
+       if (cls->fDebugLevel >= 2) HLTInfo("\tHelper thread %d terminating", par->fNum);
+#endif
+       delete tmpTracker;
+       pthread_mutex_unlock(&((pthread_mutex_t*) par->fMutex)[1]);
+       pthread_exit(NULL);
+       return(NULL);
+}
+
+void AliHLTTPCCAGPUTrackerNVCC::ResetThisHelperThread(AliHLTTPCCAGPUTrackerNVCC::helperParam* par)
+{
+       if (par->fReset) HLTImportant("GPU Helper Thread %d reseting", par->fNum);
+       par->fReset = false;
+       pthread_mutex_unlock(&((pthread_mutex_t*) par->fMutex)[1]);
+}
+
+#define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem"
+
+AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() :
+fGpuTracker(NULL),
+fGPUMemory(NULL),
+fHostLockedMemory(NULL),
+fGPUMergerMemory(NULL),
+fGPUMergerHostMemory(NULL),
+fGPUMergerMaxMemory(0),
+fDebugLevel(0),
+fDebugMask(0xFFFFFFFF),
+fOutFile(NULL),
+fGPUMemSize(0),
+fpCudaStreams(NULL),
+fSliceCount(HLTCA_GPU_DEFAULT_MAX_SLICE_COUNT),
+fCudaDevice(0),
+fOutputControl(NULL),
+fThreadId(0),
+fCudaInitialized(0),
+fPPMode(0),
+fSelfheal(0),
+fConstructorBlockCount(30),
+selectorBlockCount(30),
+fCudaContext(NULL),
+fNHelperThreads(HLTCA_GPU_DEFAULT_HELPER_THREADS),
+fHelperParams(NULL),
+fHelperMemMutex(NULL),
+fSliceOutputReady(0),
+fSliceGlobalMutexes(NULL),
+fNCPUTrackers(0),
+fNSlicesPerCPUTracker(0),
+fGlobalTracking(0),
+fUseGlobalTracking(0),
+fNSlaveThreads(0)
+{
+       fCudaContext = (void*) new CUcontext;
+};
+
+AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC()
+{
+       delete (CUcontext*) fCudaContext;
+};
+
+void AliHLTTPCCAGPUTrackerNVCC::ReleaseGlobalLock(void* sem)
+{
+       //Release the global named semaphore that locks GPU Initialization
+#ifdef R__WIN32
+       HANDLE* h = (HANDLE*) sem;
+       ReleaseSemaphore(*h, 1, NULL);
+       CloseHandle(*h);
+       delete h;
+#else
+       sem_t* pSem = (sem_t*) sem;
+       sem_post(pSem);
+       sem_unlink(SemLockName);
+#endif
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::CheckMemorySizes(int sliceCount)
+{
+       //Check constants for correct memory sizes
+       if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
+       {
+               HLTError("Insufficiant Tracker Object Memory for %d slices", sliceCount);
+               return(1);
+       }
+
+       if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
+       {
+               HLTError("Insufficiant Common Memory");
+               return(1);
+       }
+
+       if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
+       {
+               HLTError("Insufficiant Row Memory");
+               return(1);
+       }
+
+       if (fDebugLevel >= 3)
+       {
+               HLTInfo("Memory usage: Tracker Object %d / %d, Common Memory %d / %d, Row Memory %d / %d", (int) sizeof(AliHLTTPCCATracker) * sliceCount, HLTCA_GPU_TRACKER_OBJECT_MEMORY, (int) (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize()), HLTCA_GPU_COMMON_MEMORY, (int) (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow)), HLTCA_GPU_ROWS_MEMORY);
+       }
+       return(0);
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)
+{
+       //Find best CUDA device, initialize and allocate memory
+
+#if defined(HLTCA_STANDALONE) & !defined(_WIN32)
+       cpu_set_t mask;
+       CPU_ZERO(&mask);
+       CPU_SET(0, &mask);
+       //sched_setaffinity(0, sizeof(mask), &mask);
+#endif
+
+       if (sliceCount == -1) sliceCount = fSliceCount;
+
+       if (CheckMemorySizes(sliceCount)) return(1);
+
+#ifdef R__WIN32
+       HANDLE* semLock = new HANDLE;
+       *semLock = CreateSemaphore(NULL, 1, 1, SemLockName);
+       if (*semLock == NULL)
+       {
+               HLTError("Error creating GPUInit Semaphore");
+               return(1);
+       }
+       WaitForSingleObject(*semLock, INFINITE);
+#else
+       sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1);
+       if (semLock == SEM_FAILED)
+       {
+               HLTError("Error creating GPUInit Semaphore");
+               return(1);
+       }
+       timespec semtime;
+       clock_gettime(CLOCK_REALTIME, &semtime);
+       semtime.tv_sec += 10;
+       while (sem_timedwait(semLock, &semtime) != 0)
+       {
+               HLTError("Global Lock for GPU initialisation was not released for 10 seconds, assuming another thread died");
+               HLTWarning("Resetting the global lock");
+               sem_post(semLock);
+       }
+#endif
+
+       fThreadId = GetThread();
+
+       cudaDeviceProp fCudaDeviceProp;
+
+       fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
+
+#ifdef HLTCA_GPU_MERGER
+       fGPUMergerMaxMemory = 2000000 * 5 * sizeof(float);
+       fGPUMemSize += fGPUMergerMaxMemory;
+#endif
+
+#ifndef CUDA_DEVICE_EMULATION
+       int count, bestDevice = -1;
+       long long int bestDeviceSpeed = 0, deviceSpeed;
+       if (CudaFailedMsg(cudaGetDeviceCount(&count)))
+       {
+               HLTError("Error getting CUDA Device Count");
+               ReleaseGlobalLock(semLock);
+               return(1);
+       }
+       if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");
+#ifdef FERMI
+       const int reqVerMaj = 2;
+       const int reqVerMin = 0;
+#else
+       const int reqVerMaj = 1;
+       const int reqVerMin = 2;
+#endif
+       for (int i = 0;i < count;i++)
+       {
+               if (fDebugLevel >= 4) printf("Examining device %d\n", i);
+#if CUDA_VERSION > 3010
+               size_t free, total;
+#else
+               unsigned int free, total;
+#endif
+               cuInit(0);
+               CUdevice tmpDevice;
+               cuDeviceGet(&tmpDevice, i);
+               CUcontext tmpContext;
+               cuCtxCreate(&tmpContext, 0, tmpDevice);
+               if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";
+               cuCtxDestroy(tmpContext);
+               if (fDebugLevel >= 4) printf("Obtained current memory usage for device %d\n", i);
+               if (CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i))) continue;
+               if (fDebugLevel >= 4) printf("Obtained device properties for device %d\n", i);
+               int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < reqVerMaj || (fCudaDeviceProp.major == reqVerMaj && fCudaDeviceProp.minor < reqVerMin)) && free >= fGPUMemSize + 100 * 1024 + 1024;
+#ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
+               //if (sliceCount > fCudaDeviceProp.multiProcessorCount * HLTCA_GPU_BLOCK_COUNT_CONSTRUCTOR_MULTIPLIER) deviceOK = 0;
+#endif
+
+               if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %lld / %lld)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, (long long int) free, (long long int) fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");
+               deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free * (long long int) fCudaDeviceProp.major * (long long int) fCudaDeviceProp.major;
+               if (deviceOK && deviceSpeed > bestDeviceSpeed)
+               {
+                       bestDevice = i;
+                       bestDeviceSpeed = deviceSpeed;
+               }
+       }
+       if (bestDevice == -1)
+       {
+               HLTWarning("No %sCUDA Device available, aborting CUDA Initialisation", count ? "appropriate " : "");
+               HLTInfo("Requiring Revision %d.%d, Mem: %lld, Multiprocessors: %d", reqVerMaj, reqVerMin, fGPUMemSize + 100 * 1024 * 1024, sliceCount);
+               ReleaseGlobalLock(semLock);
+               return(1);
+       }
+
+       if (forceDeviceID == -1)
+               fCudaDevice = bestDevice;
+       else
+               fCudaDevice = forceDeviceID;
+#else
+       fCudaDevice = 0;
+#endif
+
+       cudaGetDeviceProperties(&fCudaDeviceProp ,fCudaDevice ); 
+
+       if (fDebugLevel >= 1)
+       {
+               HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);
+               HLTInfo("totalGlobalMem = %lld", (unsigned long long int) fCudaDeviceProp.totalGlobalMem);
+               HLTInfo("sharedMemPerBlock = %lld", (unsigned long long int) fCudaDeviceProp.sharedMemPerBlock);
+               HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);
+               HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);
+               HLTInfo("memPitch = %lld", (unsigned long long int) fCudaDeviceProp.memPitch);
+               HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);
+               HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);
+               HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);
+               HLTInfo("totalConstMem = %lld", (unsigned long long int) fCudaDeviceProp.totalConstMem);
+               HLTInfo("major = %d", fCudaDeviceProp.major);
+               HLTInfo("minor = %d", fCudaDeviceProp.minor);
+               HLTInfo("clockRate = %d", fCudaDeviceProp.clockRate);
+               HLTInfo("memoryClockRate = %d", fCudaDeviceProp.memoryClockRate);
+               HLTInfo("multiProcessorCount = %d", fCudaDeviceProp.multiProcessorCount);
+               HLTInfo("textureAlignment = %lld", (unsigned long long int) fCudaDeviceProp.textureAlignment);
+       }
+       fConstructorBlockCount = fCudaDeviceProp.multiProcessorCount * HLTCA_GPU_BLOCK_COUNT_CONSTRUCTOR_MULTIPLIER;
+       selectorBlockCount = fCudaDeviceProp.multiProcessorCount * HLTCA_GPU_BLOCK_COUNT_SELECTOR_MULTIPLIER;
+
+       if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
+       {
+               HLTError( "Unsupported CUDA Device" );
+               ReleaseGlobalLock(semLock);
+               return(1);
+       }
+
+       if (cuCtxCreate((CUcontext*) fCudaContext, CU_CTX_SCHED_AUTO, fCudaDevice) != CUDA_SUCCESS)
+       {
+               HLTError("Could not set CUDA Device!");
+               ReleaseGlobalLock(semLock);
+               return(1);
+       }
+
+       if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
+       {
+               HLTError("CUDA Memory Allocation Error");
+               cudaThreadExit();
+               ReleaseGlobalLock(semLock);
+               return(1);
+       }
+       fGPUMergerMemory = ((char*) fGPUMemory) + fGPUMemSize - fGPUMergerMaxMemory;
+       ReleaseGlobalLock(semLock);
+       if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
+       int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
+#ifdef HLTCA_GPU_MERGER
+       hostMemSize += fGPUMergerMaxMemory;
+#endif
+       if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
+       {
+               cudaFree(fGPUMemory);
+               cudaThreadExit();
+               HLTError("Error allocating Page Locked Host Memory");
+               return(1);
+       }
+       if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);
+       fGPUMergerHostMemory = ((char*) fHostLockedMemory) + hostMemSize - fGPUMergerMaxMemory;
+
+       if (fDebugLevel >= 1)
+       {
+               CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
+       }
+
+       fSliceCount = sliceCount;
+       //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
+       fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);
+
+       for (int i = 0;i < fgkNSlices;i++)
+       {
+               fSlaveTrackers[i].SetGPUTracker();
+               fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
+               fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
+       }
+
+       fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
+       cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
+       for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
+       {
+               if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
+               {
+                       cudaFree(fGPUMemory);
+                       cudaFreeHost(fHostLockedMemory);
+                       cudaThreadExit();
+                       HLTError("Error creating CUDA Stream");
+                       return(1);
+               }
+       }
+
+       if (StartHelperThreads()) return(1);
+
+       fHelperMemMutex = malloc(sizeof(pthread_mutex_t));
+       if (fHelperMemMutex == NULL)
+       {
+               HLTError("Memory allocation error");
+               cudaFree(fGPUMemory);
+               cudaFreeHost(fHostLockedMemory);
+               cudaThreadExit();
+               return(1);
+       }
+
+       if (pthread_mutex_init((pthread_mutex_t*) fHelperMemMutex, NULL))
+       {
+               HLTError("Error creating pthread mutex");
+               cudaFree(fGPUMemory);
+               cudaFreeHost(fHostLockedMemory);
+               cudaThreadExit();
+               return(1);
+       }
+
+       fSliceGlobalMutexes = malloc(sizeof(pthread_mutex_t) * fgkNSlices);
+       if (fSliceGlobalMutexes == NULL)
+       {
+               HLTError("Memory allocation error");
+               cudaFree(fGPUMemory);
+               cudaFreeHost(fHostLockedMemory);
+               cudaThreadExit();
+               return(1);
+       }
+       for (int i = 0;i < fgkNSlices;i++)
+       {
+               if (pthread_mutex_init(&((pthread_mutex_t*) fSliceGlobalMutexes)[i], NULL))
+               {
+                       HLTError("Error creating pthread mutex");
+                       cudaFree(fGPUMemory);
+                       cudaFreeHost(fHostLockedMemory);
+                       cudaThreadExit();
+                       return(1);
+               }
+       }
+
+       cuCtxPopCurrent((CUcontext*) fCudaContext);
+       fCudaInitialized = 1;
+       HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %d, Max slices: %d)", fCudaDevice, fCudaDeviceProp.name, fThreadId, fSliceCount);
+
+#if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)
+       if (fDebugLevel < 2 && 0)
+       {
+               //Do one initial run for Benchmark reasons
+               const int useDebugLevel = fDebugLevel;
+               fDebugLevel = 0;
+               AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];
+
+               std::ifstream fin;
+
+               AliHLTTPCCAParam tmpParam;
+               AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;
+
+               fin.open("events/settings.dump");
+               int tmpCount;
+               fin >> tmpCount;
+               for (int i = 0;i < sliceCount;i++)
+               {
+                       fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);
+                       tmpParam.ReadSettings(fin);
+                       InitializeSliceParam(i, tmpParam);
+               }
+               fin.close();
+
+               fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);
+               for (int i = 0;i < sliceCount;i++)
+               {
+                       tmpCluster[i].StartReading(i, 0);
+                       tmpCluster[i].ReadEvent(fin);
+               }
+               fin.close();
+
+               AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];
+               memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));
+
+               Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);
+               for (int i = 0;i < sliceCount;i++)
+               {
+                       free(tmpOutput[i]);
+                       tmpOutput[i] = NULL;
+                       fSlaveTrackers[i].SetOutputControl(NULL);
+               }
+               delete[] tmpOutput;
+               delete[] tmpCluster;
+               fDebugLevel = useDebugLevel;
+       }
+#endif
+
+       return(0);
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::StartHelperThreads()
+{
+       int nThreads = fNHelperThreads + fNCPUTrackers;
+       if (nThreads)
+       {
+               fHelperParams = new helperParam[nThreads];
+               if (fHelperParams == NULL)
+               {
+                       HLTError("Memory allocation error");
+                       cudaFree(fGPUMemory);
+                       cudaFreeHost(fHostLockedMemory);
+                       cudaThreadExit();
+                       return(1);
+               }       
+               for (int i = 0;i < nThreads;i++)
+               {
+                       fHelperParams[i].fCls = this;
+                       fHelperParams[i].fTerminate = false;
+                       fHelperParams[i].fReset = false;
+                       fHelperParams[i].fNum = i;
+                       fHelperParams[i].fMutex = malloc(2 * sizeof(pthread_mutex_t));
+                       if (fHelperParams[i].fMutex == NULL)
+                       {
+                               HLTError("Memory allocation error");
+                               cudaFree(fGPUMemory);
+                               cudaFreeHost(fHostLockedMemory);
+                               cudaThreadExit();
+                               return(1);
+                       }
+                       for (int j = 0;j < 2;j++)
+                       {
+                               if (pthread_mutex_init(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j], NULL))
+                               {
+                                       HLTError("Error creating pthread mutex");
+                                       cudaFree(fGPUMemory);
+                                       cudaFreeHost(fHostLockedMemory);
+                                       cudaThreadExit();
+                                       return(1);
+                               }
+
+                               pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]);
+                       }
+                       fHelperParams[i].fThreadId = (void*) malloc(sizeof(pthread_t));
+
+                       if (pthread_create((pthread_t*) fHelperParams[i].fThreadId, NULL, helperWrapper, &fHelperParams[i]))
+                       {
+                               HLTError("Error starting slave thread");
+                               cudaFree(fGPUMemory);
+                               cudaFreeHost(fHostLockedMemory);
+                               cudaThreadExit();
+                       }
+               }
+       }
+       fNSlaveThreads = nThreads;
+       return(0);
+}
+
+template <class T> inline T* AliHLTTPCCAGPUTrackerNVCC::alignPointer(T* ptr, int alignment)
+{
+       //Macro to align Pointers.
+       //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker
+       //(As long as every single data structure is <= 1 MB)
+
+       size_t adr = (size_t) ptr;
+       if (adr % alignment)
+       {
+               adr += alignment - (adr % alignment);
+       }
+       return((T*) adr);
+}
+
+bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsgA(cudaError_t error, const char* file, int line)
+{
+       //Check for CUDA Error and in the case of an error display the corresponding error string
+       if (error == cudaSuccess) return(false);
+       HLTWarning("CUDA Error: %d / %s (%s:%d)", error, cudaGetErrorString(error), file, line);
+       return(true);
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state, int sliceLocal, int slice)
+{
+       //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards
+
+       if (fDebugLevel == 0) return(0);
+       cudaError cuErr;
+       cuErr = cudaGetLastError();
+       if (cuErr != cudaSuccess)
+       {
+               HLTError("Cuda Error %s while running kernel (%s) (Slice %d; %d/%d)", cudaGetErrorString(cuErr), state, sliceLocal, slice, fgkNSlices);
+               return(1);
+       }
+       if (CudaFailedMsg(cudaThreadSynchronize()))
+       {
+               HLTError("CUDA Error while synchronizing (%s) (Slice %d; %d/%d)", state, sliceLocal, slice, fgkNSlices);
+               return(1);
+       }
+       if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
+       return(0);
+}
+
+void AliHLTTPCCAGPUTrackerNVCC::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
+{
+       //Set Debug Level and Debug output File if applicable
+       fDebugLevel = dwLevel;
+       if (NewOutFile) fOutFile = NewOutFile;
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::SetGPUTrackerOption(char* OptionName, int OptionValue)
+{
+       //Set a specific GPU Tracker Option
+       if (strcmp(OptionName, "PPMode") == 0)
+       {
+               fPPMode = OptionValue;
+       }
+       else if (strcmp(OptionName, "DebugMask") == 0)
+       {
+               fDebugMask = OptionValue;
+       }
+       else if (strcmp(OptionName, "HelperThreads") == 0)
+       {
+               fNHelperThreads = OptionValue;
+       }
+       else if (strcmp(OptionName, "CPUTrackers") == 0)
+       {
+               fNCPUTrackers = OptionValue;
+       }
+       else if (strcmp(OptionName, "SlicesPerCPUTracker") == 0)
+       {
+               fNSlicesPerCPUTracker = OptionValue;
+       }
+       else if (strcmp(OptionName, "GlobalTracking") == 0)
+       {
+               fGlobalTracking = OptionValue;
+       }
+       else
+       {
+               HLTError("Unknown Option: %s", OptionName);
+               return(1);
+       }
+
+       if (fNHelperThreads + fNCPUTrackers > fNSlaveThreads && fCudaInitialized)
+       {
+               HLTInfo("Insufficient Slave Threads available (%d), creating additional Slave Threads (%d+%d)\n", fNSlaveThreads, fNHelperThreads, fNCPUTrackers);
+               StopHelperThreads();
+               StartHelperThreads();
+       }
+
+       return(0);
+}
+
+#ifdef HLTCA_STANDALONE
+void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int iSlice, int i)
+{
+       //Run Performance Query for timer i of slice iSlice
+       if (fDebugLevel >= 1)
+       {
+               AliHLTTPCCATracker::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));
+       }
+}
+#else
+void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
+#endif
+
+#if defined(BITWISE_COMPATIBLE_DEBUG_OUTPUT) || defined(HLTCA_GPU_ALTERNATIVE_SCHEDULER)
+void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker*, int, bool) {}
+#else
+void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
+{
+       //Dump Rowblocks to File
+       if (fDebugLevel >= 4)
+       {
+               *fOutFile << "RowBlock Tracklets (Slice" << tracker[iSlice].Param().ISlice() << " (" << iSlice << " of reco))";
+               *fOutFile << " after Tracklet Reconstruction";
+               *fOutFile << std::endl;
+
+               int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);
+               int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);
+               uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * fConstructorBlockCount);
+               CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
+               CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
+               CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * fConstructorBlockCount, cudaMemcpyDeviceToHost));
+               CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
+
+               int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
+               for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
+               {
+                       *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " << 
+                               rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";
+                       for (int j = 0;j < rowBlockPos[i].x;j++)
+                       {
+                               //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
+                               *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
+#ifdef HLTCA_GPU_SCHED_FIXED_START
+                               if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)
+                               {
+                                       HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);
+                               }
+#endif //HLTCA_GPU_SCHED_FIXED_START
+                               k++;
+                               if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)
+                               {
+                                       HLTError("Error, -1 Tracklet found");
+                               }
+                       }
+                       *fOutFile << std::endl << "Phase 2: ";
+                       for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
+                       {
+                               *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
+                       }
+                       *fOutFile << std::endl;
+               }
+
+               if (check)
+               {
+                       *fOutFile << "Starting Threads: (Slice" << tracker[iSlice].Param().ISlice() << ", First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;
+                       for (int i = 0;i < fConstructorBlockCount;i++)
+                       {
+                               *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;
+                       }
+               }
+
+               free(rowBlockPos);
+               free(rowBlockTracklets);
+               free(blockStartingTracklet);
+       }
+}
+#endif
+
+__global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)
+{
+       //Initialize GPU RowBlocks and HitWeights
+       int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;
+       const int stride = blockDim.x * gridDim.x;
+       int4 i0;
+       i0.x = i0.y = i0.z = i0.w = 0;
+#ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
+       int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;
+       int4 i1;
+       i1.x = i1.y = i1.z = i1.w = -1;
+       for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)
+               RowBlockPos[i] = i0;
+       for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)
+               rowBlockTracklets4[i] = i1;
+#endif
+       for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)
+               sliceDataHitWeights4[i] = i0;
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::SelfHealReconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
+{
+       if (!fSelfheal)
+       {
+               cuCtxPopCurrent((CUcontext*) fCudaContext);
+               return(1);
+       }
+       static bool selfHealing = false;
+       if (selfHealing)
+       {
+               HLTError("Selfhealing failed, giving up");
+               cuCtxPopCurrent((CUcontext*) fCudaContext);
+               return(1);
+       }
+       else
+       {
+               HLTError("Unsolvable CUDA error occured, trying to reinitialize GPU");
+       }                       
+       selfHealing = true;
+       ExitGPU();
+       if (InitGPU(fSliceCount, fCudaDevice))
+       {
+               HLTError("Could not reinitialize CUDA device, disabling GPU tracker");
+               ExitGPU();
+               return(1);
+       }
+       HLTInfo("GPU tracker successfully reinitialized, restarting tracking");
+       int retVal = Reconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal);
+       selfHealing = false;
+       return(retVal);
+}
+
+void AliHLTTPCCAGPUTrackerNVCC::ReadEvent(AliHLTTPCCAClusterData* pClusterData, int firstSlice, int iSlice, int threadId)
+{
+       fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
+#ifdef HLTCA_GPU_TIME_PROFILE
+       unsigned long long int a, b;
+       AliHLTTPCCATracker::StandaloneQueryTime(&a);
+#endif
+       fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
+#ifdef HLTCA_GPU_TIME_PROFILE
+       AliHLTTPCCATracker::StandaloneQueryTime(&b);
+       printf("Read %d %f %f\n", threadId, ((double) b - (double) a) / (double) fProfTimeC, ((double) a - (double) fProfTimeD) / (double) fProfTimeC);
+#endif
+}
+
+void AliHLTTPCCAGPUTrackerNVCC::WriteOutput(AliHLTTPCCASliceOutput** pOutput, int firstSlice, int iSlice, int threadId)
+{
+       if (fDebugLevel >= 3) printf("GPU Tracker running WriteOutput for slice %d on thread %d\n", firstSlice + iSlice, threadId);
+       fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
+#ifdef HLTCA_GPU_TIME_PROFILE
+       unsigned long long int a, b;
+       AliHLTTPCCATracker::StandaloneQueryTime(&a);
+#endif
+       if (fNHelperThreads) pthread_mutex_lock((pthread_mutex_t*) fHelperMemMutex);
+       fSlaveTrackers[firstSlice + iSlice].WriteOutputPrepare();
+       if (fNHelperThreads) pthread_mutex_unlock((pthread_mutex_t*) fHelperMemMutex);
+       fSlaveTrackers[firstSlice + iSlice].WriteOutput();
+#ifdef HLTCA_GPU_TIME_PROFILE
+       AliHLTTPCCATracker::StandaloneQueryTime(&b);
+       printf("Write %d %f %f\n", threadId, ((double) b - (double) a) / (double) fProfTimeC, ((double) a - (double) fProfTimeD) / (double) fProfTimeC);
+#endif
+       if (fDebugLevel >= 3) printf("GPU Tracker finished WriteOutput for slice %d on thread %d\n", firstSlice + iSlice, threadId);
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
+{
+       //Primary reconstruction function
+
+       cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
+
+       if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;
+
+       if (!fCudaInitialized)
+       {
+               HLTError("GPUTracker not initialized");
+               return(1);
+       }
+       if (sliceCountLocal > fSliceCount)
+       {
+               HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);
+               return(1);
+       }
+       if (fThreadId != GetThread())
+       {
+               HLTWarning("CUDA thread changed, migrating context, Previous Thread: %d, New Thread: %d", fThreadId, GetThread());
+               fThreadId = GetThread();
+       }
+
+       if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);
+
+       if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
+       {
+               HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));
+               return(1);
+       }
+       
+       cuCtxPushCurrent(*((CUcontext*) fCudaContext));
+       if (fPPMode)
+       {
+               int retVal = ReconstructPP(pOutput, pClusterData, firstSlice, sliceCountLocal);
+               cuCtxPopCurrent((CUcontext*) fCudaContext);
+               return(retVal);
+       }
+
+       for (int i = fNHelperThreads;i < fNCPUTrackers + fNHelperThreads;i++)
+       {
+               fHelperParams[i].CPUTracker = 1;
+               fHelperParams[i].pClusterData = pClusterData;
+               fHelperParams[i].pOutput = pOutput;
+               fHelperParams[i].fSliceCount = sliceCountLocal;
+               fHelperParams[i].fFirstSlice = firstSlice;
+               pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]);
+       }
+       sliceCountLocal -= fNCPUTrackers * fNSlicesPerCPUTracker;
+       if (sliceCountLocal < 0) sliceCountLocal = 0;
+
+       fUseGlobalTracking = fGlobalTracking && sliceCountLocal == fgkNSlices;
+
+       memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
+
+       if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
+
+#ifdef HLTCA_GPU_TIME_PROFILE
+       AliHLTTPCCATracker::StandaloneQueryFreq(&fProfTimeC);
+       AliHLTTPCCATracker::StandaloneQueryTime(&fProfTimeD);
+#endif
+
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+       {
+               //Make this a GPU Tracker
+               fGpuTracker[iSlice].SetGPUTracker();
+               fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
+               fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
+               fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
+
+               //Set Pointers to GPU Memory
+               char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
+
+               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
+               tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
+               tmpMem = alignPointer(tmpMem, 1024 * 1024);
+
+               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
+               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount);
+               tmpMem = alignPointer(tmpMem, 1024 * 1024);
+
+               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
+               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
+               tmpMem = alignPointer(tmpMem, 1024 * 1024);
+
+               if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY RANDOM_ERROR)
+               {
+                       HLTError("Insufficiant Track Memory");
+                       cudaThreadSynchronize();
+                       cuCtxPopCurrent((CUcontext*) fCudaContext);
+                       ResetHelperThreads(0);
+                       return(1);
+               }
+
+               if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY RANDOM_ERROR)
+               {
+                       HLTError("Insufficiant Global Memory");
+                       cudaThreadSynchronize();
+                       cuCtxPopCurrent((CUcontext*) fCudaContext);
+                       ResetHelperThreads(0);
+                       return(1);
+               }
+
+               if (fDebugLevel >= 3)
+               {
+                       HLTInfo("GPU Global Memory Used: %d/%d, Page Locked Tracks Memory used: %d / %d", (int) (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice)), HLTCA_GPU_GLOBAL_MEMORY, (int) fGpuTracker[iSlice].TrackMemorySize(), HLTCA_GPU_TRACKS_MEMORY);
+               }
+
+               //Initialize Startup Constants
+               *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
+               *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
+               *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
+               fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = sliceCountLocal > fConstructorBlockCount ? (iSlice < fConstructorBlockCount) : fConstructorBlockCount * (iSlice + 1) / sliceCountLocal - fConstructorBlockCount * (iSlice) / sliceCountLocal;
+               if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);
+               fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
+               fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
+               fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
+               fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fNextTracklet = (fConstructorBlockCount / sliceCountLocal + (fConstructorBlockCount % sliceCountLocal > iSlice)) * HLTCA_GPU_THREAD_COUNT;
+               fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
+       }
+
+#ifdef HLTCA_GPU_TEXTURE_FETCH
+       cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
+       size_t offset;
+       if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
+       {
+               HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
+               cudaThreadSynchronize();
+               cuCtxPopCurrent((CUcontext*) fCudaContext);
+               ResetHelperThreads(0);
+               return(1);
+       }
+       cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
+       if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
+       {
+               HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
+               cudaThreadSynchronize();
+               cuCtxPopCurrent((CUcontext*) fCudaContext);
+               ResetHelperThreads(0);
+               return(1);
+       }
+       cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
+       if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
+       {
+               HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
+               cudaThreadSynchronize();
+               cuCtxPopCurrent((CUcontext*) fCudaContext);
+               ResetHelperThreads(0);
+               return(1);
+       }
+#endif
+
+       //Copy Tracker Object to GPU Memory
+       if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+       char* tmpMem;
+       if (CudaFailedMsg(cudaMalloc(&tmpMem, 100000000)))
+       {
+               HLTError("Error allocating CUDA profile memory");
+               cudaThreadSynchronize();
+               cuCtxPopCurrent((CUcontext*) fCudaContext);
+               ResetHelperThreads(0);
+               return(1);
+       }
+       fGpuTracker[0].fStageAtSync = tmpMem;
+       CudaFailedMsg(cudaMemset(fGpuTracker[0].StageAtSync(), 0, 100000000));
+#endif
+       CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));
+       if (CUDASync("Initialization (1)", 0, firstSlice) RANDOM_ERROR)
+       {
+               cudaThreadSynchronize();
+               cuCtxPopCurrent((CUcontext*) fCudaContext);
+               ResetHelperThreads(0);
+               return(1);
+       }
+
+       for (int i = 0;i < fNHelperThreads;i++)
+       {
+               fHelperParams[i].CPUTracker = 0;
+               fHelperParams[i].fDone = 0;
+               fHelperParams[i].fPhase = 0;
+               fHelperParams[i].pClusterData = pClusterData;
+               fHelperParams[i].fSliceCount = sliceCountLocal;
+               fHelperParams[i].fFirstSlice = firstSlice;
+               pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]);
+       }
+
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+       {
+               StandalonePerfTime(firstSlice + iSlice, 0);
+
+               //Initialize GPU Slave Tracker
+               if (fDebugLevel >= 3) HLTInfo("Creating Slice Data (Slice %d)", iSlice);
+               if (iSlice % (fNHelperThreads + 1) == 0)
+               {
+                       ReadEvent(pClusterData, firstSlice, iSlice, 0);
+               }
+               else
+               {
+                       if (fDebugLevel >= 3) HLTInfo("Waiting for helper thread %d", iSlice % (fNHelperThreads + 1) - 1);
+                       while(fHelperParams[iSlice % (fNHelperThreads + 1) - 1].fDone < iSlice);
+               }
+
+               if (fDebugLevel >= 4)
+               {
+#ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
+                       *fOutFile << std::endl << std::endl << "Reconstruction: " << iSlice << "/" << sliceCountLocal << " Total Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << " / " << fgkNSlices << std::endl;
+#endif
+                       if (fDebugMask & 1) fSlaveTrackers[firstSlice + iSlice].DumpSliceData(*fOutFile);
+               }
+
+               if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY RANDOM_ERROR)
+               {
+                       HLTError("Insufficiant Slice Data Memory");
+                       cudaThreadSynchronize();
+                       cuCtxPopCurrent((CUcontext*) fCudaContext);
+                       ResetHelperThreads(1);
+                       return(1);
+               }
+
+               if (fDebugLevel >= 3)
+               {
+                       HLTInfo("GPU Slice Data Memory Used: %d/%d", (int) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize(), HLTCA_GPU_SLICE_DATA_MEMORY);
+               }
+
+               //Initialize temporary memory where needed
+               if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");           
+               PreInitRowBlocks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());
+               if (CUDASync("Initialization (2)", iSlice, iSlice + firstSlice) RANDOM_ERROR)
+               {
+                       cudaThreadSynchronize();
+                       cuCtxPopCurrent((CUcontext*) fCudaContext);
+                       ResetHelperThreads(1);
+                       return(1);
+               }
+
+               //Copy Data to GPU Global Memory
+               CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
+               CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
+               CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
+
+               if (fDebugLevel >= 4)
+               {
+                       if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
+                       fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount);
+                       fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() );
+               }
+
+               if (CUDASync("Initialization (3)", iSlice, iSlice + firstSlice) RANDOM_ERROR)
+               {
+                       cudaThreadSynchronize();
+                       cuCtxPopCurrent((CUcontext*) fCudaContext);
+                       ResetHelperThreads(1);
+                       return(1);
+               }
+               StandalonePerfTime(firstSlice + iSlice, 1);
+
+               if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder (Slice %d/%d)", iSlice, sliceCountLocal);
+               AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), HLTCA_GPU_THREAD_COUNT_FINDER, 0, cudaStreams[iSlice & 1]>>>(iSlice);
+
+               if (CUDASync("Neighbours finder", iSlice, iSlice + firstSlice) RANDOM_ERROR)
+               {
+                       cudaThreadSynchronize();
+                       cuCtxPopCurrent((CUcontext*) fCudaContext);
+                       ResetHelperThreads(1);
+                       return(1);
+               }
+
+               StandalonePerfTime(firstSlice + iSlice, 2);
+
+               if (fDebugLevel >= 4)
+               {
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
+                       if (fDebugMask & 2) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
+               }
+
+               if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner (Slice %d/%d)", iSlice, sliceCountLocal);
+               AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice);
+               if (CUDASync("Neighbours Cleaner", iSlice, iSlice + firstSlice) RANDOM_ERROR)
+               {
+                       cudaThreadSynchronize();
+                       cuCtxPopCurrent((CUcontext*) fCudaContext);
+                       ResetHelperThreads(1);
+                       return(1);
+               }
+
+               StandalonePerfTime(firstSlice + iSlice, 3);
+
+               if (fDebugLevel >= 4)
+               {
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
+                       if (fDebugMask & 4) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
+               }
+
+               if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder (Slice %d/%d)", iSlice, sliceCountLocal);
+               AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice);
+               if (CUDASync("Start Hits Finder", iSlice, iSlice + firstSlice) RANDOM_ERROR)
+               {
+                       cudaThreadSynchronize();
+                       cuCtxPopCurrent((CUcontext*) fCudaContext);
+                       ResetHelperThreads(1);
+                       return(1);
+               }
+
+               StandalonePerfTime(firstSlice + iSlice, 4);
+
+               if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter (Slice %d/%d)", iSlice, sliceCountLocal);
+               AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice);
+               if (CUDASync("Start Hits Sorter", iSlice, iSlice + firstSlice) RANDOM_ERROR)
+               {
+                       cudaThreadSynchronize();
+                       cuCtxPopCurrent((CUcontext*) fCudaContext);
+                       ResetHelperThreads(1);
+                       return(1);
+               }
+
+               StandalonePerfTime(firstSlice + iSlice, 5);
+
+               if (fDebugLevel >= 2)
+               {
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
+                       if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d (Slice %d)", *fSlaveTrackers[firstSlice + iSlice].NTracklets(), iSlice);
+                       if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS RANDOM_ERROR)
+                       {
+                               HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant");
+                               cudaThreadSynchronize();
+                               cuCtxPopCurrent((CUcontext*) fCudaContext);
+                               ResetHelperThreads(1);
+                               return(1);
+                       }
+               }
+
+               if (fDebugLevel >= 4 && *fSlaveTrackers[firstSlice + iSlice].NTracklets())
+               {
+#ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
+                       if (fDebugMask & 8)
+                       {
+                               *fOutFile << "Temporary ";
+                               fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
+                       }
+                       uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
+                       CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
+                       if (fDebugMask & 16)
+                       {
+                               *fOutFile << "Start Hits Sort Vector:" << std::endl;
+                               for (int i = 1;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows() - 5;i++)
+                               {
+                                       *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;
+                               }
+                       }
+                       free(tmpMemory);
+#endif
+
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
+                       if (fDebugMask & 32) fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
+               }
+
+               StandalonePerfTime(firstSlice + iSlice, 6);
+
+               fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
+       }
+
+       for (int i = 0;i < fNHelperThreads;i++)
+       {
+               pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]);
+       }
+
+       StandalonePerfTime(firstSlice, 7);
+
+#ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
+       int nHardCollisions = 0;
+
+RestartTrackletConstructor:
+       if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+       {
+               AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);
+               if (CUDASync("Tracklet Initializer", iSlice, iSlice + firstSlice) RANDOM_ERROR)
+               {
+                       cudaThreadSynchronize();
+                       cuCtxPopCurrent((CUcontext*) fCudaContext);
+                       return(1);
+               }
+               if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice);
+       }
+#endif
+
+       if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
+       AliHLTTPCCATrackletConstructorGPU<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR>>>();
+       if (CUDASync("Tracklet Constructor", 0, firstSlice) RANDOM_ERROR)
+       {
+               cudaThreadSynchronize();
+               cuCtxPopCurrent((CUcontext*) fCudaContext);
+               return(1);
+       }
+
+       StandalonePerfTime(firstSlice, 8);
+
+       if (fDebugLevel >= 4)
+       {
+               for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+               {
+                       if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
+                       if (fDebugLevel >= 5)
+                       {
+                               HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
+                       }
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
+                       if (0 && fSlaveTrackers[firstSlice + iSlice].NTracklets() && fSlaveTrackers[firstSlice + iSlice].Tracklet(0).NHits() < 0)
+                       {
+                               cudaThreadSynchronize();
+                               cuCtxPopCurrent((CUcontext*) fCudaContext);
+                               printf("INTERNAL ERROR\n");
+                               return(1);
+                       }
+                       if (fDebugMask & 128) fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);
+               }
+       }
+
+       int runSlices = 0;
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)
+       {
+               if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;
+               if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);
+               AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<selectorBlockCount, HLTCA_GPU_THREAD_COUNT_SELECTOR, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));
+               if (CUDASync("Tracklet Selector", iSlice, iSlice + firstSlice) RANDOM_ERROR)
+               {
+                       cudaThreadSynchronize();
+                       cuCtxPopCurrent((CUcontext*) fCudaContext);
+                       return(1);
+               }
+       }
+       StandalonePerfTime(firstSlice, 9);
+
+       char *tmpMemoryGlobalTracking = NULL;
+       fSliceOutputReady = 0;
+       if (fUseGlobalTracking)
+       {
+               int tmpmemSize = sizeof(AliHLTTPCCATracklet)
+#ifdef EXTERN_ROW_HITS
+               + HLTCA_ROW_COUNT * sizeof(int)
+#endif
+               + 16;
+               tmpMemoryGlobalTracking = (char*) malloc(tmpmemSize * fgkNSlices);
+               for (int i = 0;i < fgkNSlices;i++)
+               {
+                       fSliceLeftGlobalReady[i] = 0;
+                       fSliceRightGlobalReady[i] = 0;
+               }
+               memset(fGlobalTrackingDone, 0, fgkNSlices);
+               memset(fWriteOutputDone, 0, fgkNSlices);
+
+               for (int iSlice = 0;iSlice < fgkNSlices;iSlice++)
+               {
+                       fSlaveTrackers[iSlice].SetGPUTrackerTrackletsMemory(tmpMemoryGlobalTracking + (tmpmemSize * iSlice), 1, fConstructorBlockCount);
+               }
+       }
+       for (int i = 0;i < fNHelperThreads;i++)
+       {
+               fHelperParams[i].fPhase = 1;
+               fHelperParams[i].pOutput = pOutput;
+               pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]);
+       }
+
+       int tmpSlice = 0, tmpSlice2 = 0;
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+       {
+               if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");
+
+               while(tmpSlice < sliceCountLocal && (tmpSlice == iSlice || cudaStreamQuery(cudaStreams[tmpSlice]) == (cudaError_t) CUDA_SUCCESS))
+               {
+                       if (CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice])) RANDOM_ERROR)
+                       {
+                               ResetHelperThreads(1);
+                               cudaThreadSynchronize();
+                               return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal));
+                       }
+                       tmpSlice++;
+               }
+
+               while (tmpSlice2 < tmpSlice && (tmpSlice2 == iSlice ? cudaStreamSynchronize(cudaStreams[tmpSlice2]) : cudaStreamQuery(cudaStreams[tmpSlice2])) == (cudaError_t) CUDA_SUCCESS)
+               {
+                       CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].Tracks(), fGpuTracker[tmpSlice2].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + tmpSlice2].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));
+                       CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].TrackHits(), fGpuTracker[tmpSlice2].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + tmpSlice2].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));
+                       tmpSlice2++;
+               }
+
+               if (CudaFailedMsg(cudaStreamSynchronize(cudaStreams[iSlice])) RANDOM_ERROR)
+               {
+                       ResetHelperThreads(1);
+                       cudaThreadSynchronize();
+                       return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal));
+               }
+
+               if (fDebugLevel >= 4)
+               {
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
+#ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
+                       if (fDebugMask & 256) fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
+#endif
+                       if (fDebugMask & 512) fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);
+               }
+
+               if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError RANDOM_ERROR)
+               {
+#ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
+                       if ((fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION || fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_WRONG_ROW)&& nHardCollisions++ < 10)
+                       {
+                               if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION)
+                               {
+                                       HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor (Slice %d)", firstSlice + iSlice);
+                               }
+                               else
+                               {
+                                       HLTWarning("Tracklet Constructor returned invalid row (Slice %d)", firstSlice + iSlice);
+                               }
+                               if (fDebugLevel >= 4)
+                               {
+                                       ResetHelperThreads(1);
+                                       cudaThreadSynchronize();
+                                       cuCtxPopCurrent((CUcontext*) fCudaContext);
+                                       return(1);
+                               }
+                               for (int i = 0;i < sliceCountLocal;i++)
+                               {
+                                       cudaThreadSynchronize();
+                                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));
+                                       *fSlaveTrackers[firstSlice + i].NTracks() = 0;
+                                       *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;
+                                       fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;
+                                       CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));
+                                       PreInitRowBlocks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());
+                               }
+                               goto RestartTrackletConstructor;
+                       }
+#endif
+                       HLTError("GPU Tracker returned Error Code %d in slice %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError, firstSlice + iSlice);
+                       cudaThreadSynchronize();
+                       cuCtxPopCurrent((CUcontext*) fCudaContext);
+                       ResetHelperThreads(1);
+                       return(1);
+               }
+               if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
+
+               fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTracks = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracks;
+               fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTrackHits = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTrackHits;
+               if (fUseGlobalTracking) fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracklets = 1;
+
+               if (fDebugLevel >= 3) HLTInfo("Data ready for slice %d, helper thread %d", iSlice, iSlice % (fNHelperThreads + 1));
+               fSliceOutputReady = iSlice;
+
+               if (fUseGlobalTracking)
+               {
+                       if (iSlice % (fgkNSlices / 2) == 2)
+                       {
+                               int tmpId = iSlice % (fgkNSlices / 2) - 1;
+                               if (iSlice >= fgkNSlices / 2) tmpId += fgkNSlices / 2;
+                               GlobalTracking(tmpId, 0, NULL);
+                               fGlobalTrackingDone[tmpId] = 1;
+                       }
+                       for (int tmpSlice3a = 0;tmpSlice3a < iSlice;tmpSlice3a += fNHelperThreads + 1)
+                       {
+                               int tmpSlice3 = tmpSlice3a + 1;
+                               if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2);
+                               if (tmpSlice3 >= iSlice) break;
+
+                               int sliceLeft = (tmpSlice3 + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2);
+                               int sliceRight = (tmpSlice3 + 1) % (fgkNSlices / 2);
+                               if (tmpSlice3 >= fgkNSlices / 2)
+                               {
+                                       sliceLeft += fgkNSlices / 2;
+                                       sliceRight += fgkNSlices / 2;
+                               }
+
+                               if (tmpSlice3 % (fgkNSlices / 2) != 1 && fGlobalTrackingDone[tmpSlice3] == 0 && sliceLeft < iSlice && sliceRight < iSlice)
+                               {
+                                       GlobalTracking(tmpSlice3, 0, NULL);
+                                       fGlobalTrackingDone[tmpSlice3] = 1;
+                               }
+
+                               if (fWriteOutputDone[tmpSlice3] == 0 && fSliceLeftGlobalReady[tmpSlice3] && fSliceRightGlobalReady[tmpSlice3])
+                               {
+                                       WriteOutput(pOutput, firstSlice, tmpSlice3, 0);
+                                       fWriteOutputDone[tmpSlice3] = 1;
+                               }
+                       }
+               }
+               else
+               {
+                       if (iSlice % (fNHelperThreads + 1) == 0)
+                       {
+                               WriteOutput(pOutput, firstSlice, iSlice, 0);
+                       }
+               }
+
+               if (fDebugLevel >= 4)
+               {
+                       delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();
+                       delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();
+               }
+       }
+
+       if (fUseGlobalTracking)
+       {
+               for (int tmpSlice3a = 0;tmpSlice3a < fgkNSlices;tmpSlice3a += fNHelperThreads + 1)
+               {
+                       int tmpSlice3 = (tmpSlice3a + 1);
+                       if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2);
+                       if (fGlobalTrackingDone[tmpSlice3] == 0) GlobalTracking(tmpSlice3, 0, NULL);
+               }
+               for (int tmpSlice3a = 0;tmpSlice3a < fgkNSlices;tmpSlice3a += fNHelperThreads + 1)
+               {
+                       int tmpSlice3 = (tmpSlice3a + 1);
+                       if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2);
+                       if (fWriteOutputDone[tmpSlice3] == 0)
+                       {
+                               while (fSliceLeftGlobalReady[tmpSlice3] == 0 || fSliceRightGlobalReady[tmpSlice3] == 0);
+                               WriteOutput(pOutput, firstSlice, tmpSlice3, 0);
+                       }
+               }
+       }
+
+       for (int i = 0;i < fNHelperThreads + fNCPUTrackers;i++)
+       {
+               pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]);
+       }
+
+       if (fUseGlobalTracking)
+       {
+               free(tmpMemoryGlobalTracking);
+               if (fDebugLevel >= 3)
+               {
+                       for (int iSlice = 0;iSlice < fgkNSlices;iSlice++)
+                       {
+                               printf("Slice %d - Tracks: Local %d Global %d - Hits: Local %d Global %d\n", iSlice, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTracks, fSlaveTrackers[iSlice].CommonMemory()->fNTracks, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTrackHits, fSlaveTrackers[iSlice].CommonMemory()->fNTrackHits);
+                       }
+               }
+       }
+
+       StandalonePerfTime(firstSlice, 10);
+
+       if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
+
+       /*for (int i = firstSlice;i < firstSlice + sliceCountLocal;i++)
+       {
+               fSlaveTrackers[i].DumpOutput(stdout);
+       }*/
+
+       /*static int runnum = 0;
+       std::ofstream tmpOut;
+       char buffer[1024];
+       sprintf(buffer, "GPUtracks%d.out", runnum++);
+       tmpOut.open(buffer);
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+       {
+               fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(tmpOut);
+       }
+       tmpOut.close();*/
+
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+       char* stageAtSync = (char*) malloc(100000000);
+       CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].StageAtSync(), 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
+       cudaFree(fGpuTracker[0].StageAtSync());
+
+       FILE* fp = fopen("profile.txt", "w+");
+       FILE* fp2 = fopen("profile.bmp", "w+b");
+       int nEmptySync = 0, fEmpty;
+
+       const int bmpheight = 8192;
+       BITMAPFILEHEADER bmpFH;
+       BITMAPINFOHEADER bmpIH;
+       ZeroMemory(&bmpFH, sizeof(bmpFH));
+       ZeroMemory(&bmpIH, sizeof(bmpIH));
+
+       bmpFH.bfType = 19778; //"BM"
+       bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR / 32 * 33 - 1) * bmpheight ;
+       bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);
+
+       bmpIH.biSize = sizeof(bmpIH);
+       bmpIH.biWidth = fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR / 32 * 33 - 1;
+       bmpIH.biHeight = bmpheight;
+       bmpIH.biPlanes = 1;
+       bmpIH.biBitCount = 32;
+
+       fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
+       fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);  
+
+       for (int i = 0;i < bmpheight * fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;i += fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR)
+       {
+               fEmpty = 1;
+               for (int j = 0;j < fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;j++)
+               {
+                       fprintf(fp, "%d\t", stageAtSync[i + j]);
+                       int color = 0;
+                       if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);
+                       if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);
+                       if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);
+                       if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);
+                       fwrite(&color, 1, sizeof(int), fp2);
+                       if (j > 0 && j % 32 == 0)
+                       {
+                               color = RGB(255, 255, 255);
+                               fwrite(&color, 1, 4, fp2);
+                       }
+                       if (stageAtSync[i + j]) fEmpty = 0;
+               }
+               fprintf(fp, "\n");
+               if (fEmpty) nEmptySync++;
+               else nEmptySync = 0;
+               //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;
+       }
+
+       fclose(fp);
+       fclose(fp2);
+       free(stageAtSync);
+#endif 
+
+       cuCtxPopCurrent((CUcontext*) fCudaContext);
+       return(0);
+}
+
+__global__ void ClearPPHitWeights(int sliceCount)
+{
+       //Clear HitWeights
+
+       for (int k = 0;k < sliceCount;k++)
+       {
+               AliHLTTPCCATracker &tracker = ((AliHLTTPCCATracker*) gAliHLTTPCCATracker)[k];
+               int4* const pHitWeights = (int4*) tracker.Data().HitWeights();
+               const int dwCount = tracker.Data().NumberOfHitsPlusAlign();
+               const int stride = blockDim.x * gridDim.x;
+               int4 i0;
+               i0.x = i0.y = i0.z = i0.w = 0;
+
+               for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < dwCount * sizeof(int) / sizeof(int4);i += stride)
+               {
+                       pHitWeights[i] = i0;
+               }
+       }
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
+{
+       //Primary reconstruction function for small events (PP)
+
+       memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
+
+       if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
+
+       char* tmpSliceMemHost = (char*) SliceDataMemory(fHostLockedMemory, 0);
+       char* tmpSliceMemGpu = (char*) SliceDataMemory(fGPUMemory, 0);
+
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+       {
+               StandalonePerfTime(firstSlice + iSlice, 0);
+
+               //Initialize GPU Slave Tracker
+               if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
+               fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(tmpSliceMemHost, RowMemory(fHostLockedMemory, firstSlice + iSlice));
+               fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
+               if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
+               {
+                       HLTError("Insufficiant Slice Data Memory");
+                       return(1);
+               }
+
+               //Make this a GPU Tracker
+               fGpuTracker[iSlice].SetGPUTracker();
+               fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
+
+
+               fGpuTracker[iSlice].SetGPUSliceDataMemory(tmpSliceMemGpu, RowMemory(fGPUMemory, iSlice));
+               fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
+
+               tmpSliceMemHost += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();
+               tmpSliceMemHost = alignPointer(tmpSliceMemHost, 64 * 1024);
+               tmpSliceMemGpu += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();
+               tmpSliceMemGpu = alignPointer(tmpSliceMemGpu, 64 * 1024);
+
+
+               //Set Pointers to GPU Memory
+               char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
+
+               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
+               tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
+               tmpMem = alignPointer(tmpMem, 64 * 1024);
+
+               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
+               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount);
+               tmpMem = alignPointer(tmpMem, 64 * 1024);
+
+               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
+               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
+               tmpMem = alignPointer(tmpMem, 64 * 1024);
+
+               if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
+               {
+                       HLTError("Insufficiant Track Memory");
+                       return(1);
+               }
+
+               if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
+               {
+                       HLTError("Insufficiant Global Memory");
+                       return(1);
+               }
+
+               //Initialize Startup Constants
+               *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
+               *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
+               *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
+               fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
+
+               fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
+
+               if (CUDASync("Initialization", iSlice, iSlice + firstSlice)) return(1);
+               StandalonePerfTime(firstSlice + iSlice, 1);
+       }
+
+#ifdef HLTCA_GPU_TEXTURE_FETCH
+       cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
+       size_t offset;
+       if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
+       {
+               HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
+               return(1);
+       }
+       cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
+       if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
+       {
+               HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
+               return(1);
+       }
+       cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
+       if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
+       {
+               HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
+               return(1);
+       }
+#endif
+
+       //Copy Tracker Object to GPU Memory
+       if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
+       CudaFailedMsg(cudaMemcpyToSymbol(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice));
+
+       //Copy Data to GPU Global Memory
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+       {
+               CudaFailedMsg(cudaMemcpy(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice));
+               //printf("%lld %lld %d %d\n", (size_t) (char*) fGpuTracker[iSlice].Data().Memory(), (size_t) (char*) fSlaveTrackers[firstSlice + iSlice].Data().Memory(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize());
+       }
+       //CudaFailedMsg(cudaMemcpy(SliceDataMemory(fGPUMemory, 0), SliceDataMemory(fHostLockedMemory, 0), tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0), cudaMemcpyHostToDevice));
+       //printf("%lld %lld %d\n", (size_t) (char*) SliceDataMemory(fGPUMemory, 0), (size_t) (char*) SliceDataMemory(fHostLockedMemory, 0), (int) (size_t) (tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0)));
+       CudaFailedMsg(cudaMemcpy(fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyHostToDevice));
+       CudaFailedMsg(cudaMemcpy(fGpuTracker[0].SliceDataRows(), fSlaveTrackers[firstSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) * sliceCountLocal, cudaMemcpyHostToDevice));
+
+       if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");
+       AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursFinder> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_FINDER>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows());
+       if (CUDASync("Neighbours finder", 0, firstSlice)) return 1;
+       StandalonePerfTime(firstSlice, 2);
+       if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");
+       AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursCleaner> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 2);
+       if (CUDASync("Neighbours Cleaner", 0, firstSlice)) return 1;
+       StandalonePerfTime(firstSlice, 3);
+       if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");
+       AliHLTTPCCAProcessMultiA<AliHLTTPCCAStartHitsFinder> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 6);
+       if (CUDASync("Start Hits Finder", 0, firstSlice)) return 1;
+       StandalonePerfTime(firstSlice, 4);
+
+       ClearPPHitWeights <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(sliceCountLocal);
+       if (CUDASync("Clear Hit Weights", 0, firstSlice)) return 1;
+
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+       {
+               fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
+       }
+
+       StandalonePerfTime(firstSlice, 7);
+
+       if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
+       AliHLTTPCCATrackletConstructorGPUPP<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR>>>(0, sliceCountLocal);
+       if (CUDASync("Tracklet Constructor PP", 0, firstSlice)) return 1;
+
+       StandalonePerfTime(firstSlice, 8);
+
+       AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<selectorBlockCount, HLTCA_GPU_THREAD_COUNT_SELECTOR>>>(0, sliceCountLocal);
+       if (CUDASync("Tracklet Selector", 0, firstSlice)) return 1;
+       StandalonePerfTime(firstSlice, 9);
+
+       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice].CommonMemory(), fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyDeviceToHost));
+
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+       {
+               if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");
+
+               CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost));
+               CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost));
+
+               if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
+               {
+                       HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
+                       return(1);
+               }
+               if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
+               fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTracks = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracks;
+               fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTrackHits = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTrackHits;
+       }
+
+       if (fGlobalTracking && sliceCountLocal == fgkNSlices)
+       {
+               char tmpMemory[sizeof(AliHLTTPCCATracklet)
+#ifdef EXTERN_ROW_HITS
+               + HLTCA_ROW_COUNT * sizeof(int)
+#endif
+               + 16];
+
+               for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+               {
+                       if (fSlaveTrackers[iSlice].CommonMemory()->fNTracklets)
+                       {
+                               HLTError("Slave tracker tracklets found where none expected, memory not freed!\n");
+                       }
+                       fSlaveTrackers[iSlice].SetGPUTrackerTrackletsMemory(&tmpMemory[0], 1, fConstructorBlockCount);
+                       fSlaveTrackers[iSlice].CommonMemory()->fNTracklets = 1;
+               }
+
+               for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+               {
+                       int sliceLeft = (iSlice + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2);
+                       int sliceRight = (iSlice + 1) % (fgkNSlices / 2);
+                       if (iSlice >= fgkNSlices / 2)
+                       {
+                               sliceLeft += fgkNSlices / 2;
+                               sliceRight += fgkNSlices / 2;
+                       }
+                       fSlaveTrackers[iSlice].PerformGlobalTracking(fSlaveTrackers[sliceLeft], fSlaveTrackers[sliceRight], HLTCA_GPU_MAX_TRACKS);              
+               }
+
+               for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+               {
+                       printf("Slice %d - Tracks: Local %d Global %d - Hits: Local %d Global %d\n", iSlice, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTracks, fSlaveTrackers[iSlice].CommonMemory()->fNTracks, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTrackHits, fSlaveTrackers[iSlice].CommonMemory()->fNTrackHits);
+               }
+       }
+
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+       {
+               fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
+               fSlaveTrackers[firstSlice + iSlice].WriteOutputPrepare();
+               fSlaveTrackers[firstSlice + iSlice].WriteOutput();
+       }
+
+       StandalonePerfTime(firstSlice, 10);
+
+       if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
+
+       return(0);
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
+{
+       //Initialize Slice Tracker Parameter for a slave tracker
+       fSlaveTrackers[iSlice].Initialize(param);
+       if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
+       {
+               HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
+               return(1);
+       }
+       return(0);
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::ExitGPU()
+{
+       //Uninitialize CUDA
+       cuCtxPushCurrent(*((CUcontext*) fCudaContext));
+
+       cudaThreadSynchronize();
+       if (fGPUMemory)
+       {
+               cudaFree(fGPUMemory);
+               fGPUMemory = NULL;
+       }
+       if (fHostLockedMemory)
+       {
+               for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
+               {
+                       cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
+               }
+               free(fpCudaStreams);
+               fGpuTracker = NULL;
+               cudaFreeHost(fHostLockedMemory);
+       }
+
+       if (CudaFailedMsg(cudaThreadExit()))
+       {
+               HLTError("Could not uninitialize GPU");
+               return(1);
+       }
+
+       if (StopHelperThreads()) return(1);
+       pthread_mutex_destroy((pthread_mutex_t*) fHelperMemMutex);
+       free(fHelperMemMutex);
+
+       for (int i = 0;i < fgkNSlices;i++) pthread_mutex_destroy(&((pthread_mutex_t*) fSliceGlobalMutexes)[i]);
+       free(fSliceGlobalMutexes);
+
+       cuCtxDestroy(*((CUcontext*) fCudaContext));
+
+       cudaDeviceReset();
+
+       HLTInfo("CUDA Uninitialized");
+       fCudaInitialized = 0;
+       return(0);
+}
+
+void AliHLTTPCCAGPUTrackerNVCC::ResetHelperThreads(int helpers)
+{
+       HLTImportant("Error occurred, GPU tracker helper threads will be reset (Number of threads %d/%d)", fNHelperThreads, fNCPUTrackers);
+       for (int i = 0;i < fNHelperThreads + fNCPUTrackers;i++)
+       {
+               fHelperParams[i].fReset = true;
+               if (helpers || i >= fNHelperThreads) pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]);
+       }
+       HLTImportant("GPU Tracker helper threads have ben reset");
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::StopHelperThreads()
+{
+       if (fNSlaveThreads)
+       {
+               for (int i = 0;i < fNSlaveThreads;i++)
+               {
+                       fHelperParams[i].fTerminate = true;
+                       if (pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]))
+                       {
+                               HLTError("Error unlocking mutex to terminate slave");
+                               return(1);
+                       }
+                       if (pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]))
+                       {
+                               HLTError("Error locking mutex");
+                               return(1);
+                       }
+                       if (pthread_join( *((pthread_t*) fHelperParams[i].fThreadId), NULL))
+                       {
+                               HLTError("Error waiting for thread to terminate");
+                               return(1);
+                       }
+                       free(fHelperParams[i].fThreadId);
+                       for (int j = 0;j < 2;j++)
+                       {
+                               if (pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]))
+                               {
+                                       HLTError("Error unlocking mutex before destroying");
+                                       return(1);
+                               }
+                               pthread_mutex_destroy(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]);
+                       }
+                       free(fHelperParams[i].fMutex);
+               }
+               delete[] fHelperParams;
+       }
+       fNSlaveThreads = 0;
+       return(0);
+}
+
+void AliHLTTPCCAGPUTrackerNVCC::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
+{
+       //Set Output Control Pointers
+       fOutputControl = val;
+       for (int i = 0;i < fgkNSlices;i++)
+       {
+               fSlaveTrackers[i].SetOutputControl(val);
+       }
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::GetThread()
+{
+       //Get Thread ID
+#ifdef R__WIN32
+       return((int) (size_t) GetCurrentThread());
+#else
+       return((int) syscall (SYS_gettid));
+#endif
+}
+
+unsigned long long int* AliHLTTPCCAGPUTrackerNVCC::PerfTimer(int iSlice, unsigned int i)
+{
+       //Returns pointer to PerfTimer i of slice iSlice
+       return(fSlaveTrackers ? fSlaveTrackers[iSlice].PerfTimer(i) : NULL);
+}
+
+const AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerNVCC::OutputControl() const
+{
+       //Return Pointer to Output Control Structure
+       return fOutputControl;
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::GetSliceCount() const
+{
+       //Return max slice count processable
+       return(fSliceCount);
+}
+
+char* AliHLTTPCCAGPUTrackerNVCC::MergerBaseMemory()
+{
+       return(alignPointer((char*) fGPUMergerHostMemory, 1024 * 1024));
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::RefitMergedTracks(AliHLTTPCGMMerger* Merger)
+{
+#ifndef HLTCA_GPU_MERGER
+       HLTError("HLTCA_GPU_MERGER compile flag not set");
+       return(1);
+#else
+       if (!fCudaInitialized)
+       {
+               HLTError("GPU Merger not initialized");
+               return(1);
+       }
+       unsigned long long int a, b, c, d, e;
+       AliHLTTPCCATracker::StandaloneQueryFreq(&e);
+
+       char* gpumem = (char*) fGPUMergerMemory;
+       float *X, *Y, *Z, *Angle;
+       unsigned int *RowType;
+       AliHLTTPCGMMergedTrack* tracks;
+       float* field;
+       AliHLTTPCCAParam* param;
+
+       gpumem = alignPointer(gpumem, 1024 * 1024);
+
+       AssignMemory(X, gpumem, Merger->NClusters());
+       AssignMemory(Y, gpumem, Merger->NClusters());
+       AssignMemory(Z, gpumem, Merger->NClusters());
+       AssignMemory(Angle, gpumem, Merger->NClusters());
+       AssignMemory(RowType, gpumem, Merger->NClusters());
+       AssignMemory(tracks, gpumem, Merger->NOutputTracks());
+       AssignMemory(field, gpumem, 6);
+       AssignMemory(param, gpumem, 1);
+
+
+       if ((size_t) (gpumem - (char*) fGPUMergerMemory) > (size_t) fGPUMergerMaxMemory)
+       {
+               HLTError("Insufficiant GPU Merger Memory");
+       }
+
+       cuCtxPushCurrent(*((CUcontext*) fCudaContext));
+
+       if (fDebugLevel >= 2) HLTInfo("Running GPU Merger (%d/%d)", Merger->NOutputTrackClusters(), Merger->NClusters());
+       AliHLTTPCCATracker::StandaloneQueryTime(&a);
+       CudaFailedMsg(cudaMemcpy(X, Merger->ClusterX(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
+       CudaFailedMsg(cudaMemcpy(Y, Merger->ClusterY(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
+       CudaFailedMsg(cudaMemcpy(Z, Merger->ClusterZ(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
+       CudaFailedMsg(cudaMemcpy(Angle, Merger->ClusterAngle(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
+       CudaFailedMsg(cudaMemcpy(RowType, Merger->ClusterRowType(), Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyHostToDevice));
+       CudaFailedMsg(cudaMemcpy(tracks, Merger->OutputTracks(), Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyHostToDevice));
+       CudaFailedMsg(cudaMemcpy(field, Merger->PolinomialFieldBz(), 6 * sizeof(float), cudaMemcpyHostToDevice));
+       CudaFailedMsg(cudaMemcpy(param, fSlaveTrackers[0].pParam(), sizeof(AliHLTTPCCAParam), cudaMemcpyHostToDevice));
+       AliHLTTPCCATracker::StandaloneQueryTime(&b);
+       RefitTracks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(tracks, Merger->NOutputTracks(), field, X, Y, Z, RowType, Angle, param);
+       CudaFailedMsg(cudaThreadSynchronize());
+       AliHLTTPCCATracker::StandaloneQueryTime(&c);
+       CudaFailedMsg(cudaMemcpy(Merger->ClusterX(), X, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
+       CudaFailedMsg(cudaMemcpy(Merger->ClusterY(), Y, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
+       CudaFailedMsg(cudaMemcpy(Merger->ClusterZ(), Z, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
+       CudaFailedMsg(cudaMemcpy(Merger->ClusterAngle(), Angle, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
+       CudaFailedMsg(cudaMemcpy(Merger->ClusterRowType(), RowType, Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyDeviceToHost));
+       CudaFailedMsg(cudaMemcpy((void*) Merger->OutputTracks(), tracks, Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyDeviceToHost));
+       CudaFailedMsg(cudaThreadSynchronize());
+       AliHLTTPCCATracker::StandaloneQueryTime(&d);
+       if (fDebugLevel >= 2) HLTInfo("GPU Merger Finished");
+
+       if (fDebugLevel > 0)
+       {
+               int copysize = 4 * Merger->NOutputTrackClusters() * sizeof(float) + Merger->NOutputTrackClusters() * sizeof(unsigned int) + Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack) + 6 * sizeof(float) + sizeof(AliHLTTPCCAParam);
+               double speed = (double) copysize * (double) e / (double) (b - a) / 1e9;
+               printf("GPU Fit:\tCopy To:\t%lld us (%lf GB/s)\n", (b - a) * 1000000 / e, speed);
+               printf("\t\tFit:\t%lld us\n", (c - b) * 1000000 / e);
+               speed = (double) copysize * (double) e / (double) (d - c) / 1e9;
+               printf("\t\tCopy From:\t%lld us (%lf GB/s)\n", (d - c) * 1000000 / e, speed);
+       }
+
+       cuCtxPopCurrent((CUcontext*) fCudaContext);
+       return(0);
+#endif
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::IsInitialized()
+{
+       return(fCudaInitialized);
+}
+
+AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()
+{
+       return new AliHLTTPCCAGPUTrackerNVCC;
+}
+
+void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr)
+{
+       delete ptr;
+}
+
diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu.x86_64-pc-linux-gnu.patch b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu.x86_64-pc-linux-gnu.patch
new file mode 100755 (executable)
index 0000000..3202e11
--- /dev/null
@@ -0,0 +1,122 @@
+--- AliHLTTPCCAGPUTracker.cucpp        2009-05-28 12:14:09.000000000 +0200 
++++ release/x86_64-pc-linux-gnu/code/AliHLTTPCCAGPUTracker.cucpp 2009-05-28 12:10:25.000000000 +0200
+@@ -1530,10 +1530,10 @@
+ extern "C" { extern int getdate_err; }
+ extern "C" tm *getdate(const char *);
+ extern "C" int getdate_r(const char *__restrict__, tm *__restrict__);
+-extern "C" { extern inline __attribute__((__weak__)) void *memcpy(void *__restrict__, const void *__restrict__, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))); }
++extern "C" { extern inline void *memcpy(void *__restrict__, const void *__restrict__, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))); }
+ extern "C" { extern inline void *memmove(void *, const void *, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))); }
+ extern "C" void *memccpy(void *__restrict__, const void *__restrict__, int, size_t) throw() __attribute__((nonnull(1))) __attribute__((nonnull(2)));
+-extern "C" { extern inline __attribute__((__weak__)) void *memset(void *, int, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))); }
++extern "C" { extern inline void *memset(void *, int, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))); }
+ extern "C" int memcmp(const void *, const void *, size_t) throw() __attribute__((__pure__)) __attribute__((nonnull(1))) __attribute__((nonnull(2)));
+ extern inline void *memchr(void *, int, size_t) throw() __asm__("memchr") __attribute__((__pure__)) __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1)));
+ extern inline const void *memchr(const void *, int, size_t) throw() __asm__("memchr") __attribute__((__pure__)) __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1)));
+@@ -1661,7 +1661,7 @@
+ char *basename(char *) throw() __asm__("basename") __attribute__((nonnull(1)));
+ const char *basename(const char *) throw() __asm__("basename") __attribute__((nonnull(1)));
+ extern "C" void __warn_memset_zero_len();
+-extern "C" { inline __attribute__((__weak__)) __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))) void *memcpy(void *__restrict__ __dest, const void *__restrict__ __src, size_t __len) throw()
++extern "C" { inline __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))) void *memcpy(void *__restrict__ __dest, const void *__restrict__ __src, size_t __len) throw()
+ {
+ return __builtin___memcpy_chk(__dest, __src, __len, __builtin_object_size(__dest, 0));
+ } }
+@@ -1673,7 +1673,7 @@
+ {
+ return __builtin___mempcpy_chk(__dest, __src, __len, __builtin_object_size(__dest, 0));
+ } }
+-extern "C" { inline __attribute__((__weak__)) __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) void *memset(void *__dest, int __ch, size_t __len) throw()
++extern "C" { inline __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) void *memset(void *__dest, int __ch, size_t __len) throw()
+ {
+ if (((0) && (__len == (0))) && ((!(0)) || (__ch != 0)))
+ {
+@@ -1719,8 +1719,6 @@
+ return __builtin___strncat_chk(__dest, __src, __len, __builtin_object_size(__dest, 2 > 1));
+ } }
+ extern "C" __attribute__((__weak__)) clock_t clock() throw();
+-extern "C" { extern inline __attribute__((__weak__)) void *memset(void *, int, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))); }
+-extern "C" { extern inline __attribute__((__weak__)) void *memcpy(void *, const void *, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))); }
+ extern "C" __attribute__((__weak__)) int abs(int) throw() __attribute__((__warn_unused_result__)) __attribute__((__const__));
+ extern "C" __attribute__((__weak__)) long labs(long) throw() __attribute__((__warn_unused_result__)) __attribute__((__const__));
+ extern "C" __attribute__((__weak__)) long long llabs(long long) throw() __attribute__((__warn_unused_result__)) __attribute__((__const__));
+@@ -1862,11 +1860,8 @@
+ extern "C" __attribute__((__weak__)) int __isnanf(float) throw() __attribute__((__const__));
+ extern "C" __attribute__((__weak__)) int __finite(double) throw() __attribute__((__const__));
+ extern "C" __attribute__((__weak__)) int __finitef(float) throw() __attribute__((__const__));
+-extern "C" { extern inline __attribute__((__weak__)) int __signbit(double) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); }
+-extern "C" { extern inline __attribute__((__weak__)) int __signbitf(float) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); }
+ extern "C" __attribute__((__weak__)) double fma(double, double, double) throw();
+ extern "C" __attribute__((__weak__)) float fmaf(float, float, float) throw();
+-extern "C" { extern inline __attribute__((__weak__)) int __signbitl(long double) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); }
+ extern "C" __attribute__((__weak__)) int __isinfl(long double) throw() __attribute__((__const__));
+ extern "C" __attribute__((__weak__)) int __isnanl(long double) throw() __attribute__((__const__));
+ extern "C" __attribute__((__weak__)) int __finitel(long double) throw() __attribute__((__const__));
+@@ -1948,7 +1943,7 @@
+ extern "C" __attribute__((__weak__)) double fmax(double, double) throw(); extern "C" double __fmax(double, double) throw();
+ extern "C" __attribute__((__weak__)) double fmin(double, double) throw(); extern "C" double __fmin(double, double) throw();
+ extern "C" int __fpclassify(double) throw() __attribute__((__const__));
+-extern "C" { extern inline __attribute__((__weak__)) int __signbit(double) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); }
++extern "C" { extern inline int __signbit(double) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); }
+ extern "C" __attribute__((__weak__)) double fma(double, double, double) throw(); extern "C" double __fma(double, double, double) throw();
+ extern "C" double scalb(double, double) throw(); extern "C" double __scalb(double, double) throw();
+ extern "C" __attribute__((__weak__)) float acosf(float) throw(); extern "C" float __acosf(float) throw();
+@@ -2027,7 +2022,7 @@
+ extern "C" __attribute__((__weak__)) float fmaxf(float, float) throw(); extern "C" float __fmaxf(float, float) throw();
+ extern "C" __attribute__((__weak__)) float fminf(float, float) throw(); extern "C" float __fminf(float, float) throw();
+ extern "C" int __fpclassifyf(float) throw() __attribute__((__const__));
+-extern "C" { extern inline __attribute__((__weak__)) int __signbitf(float) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); }
++extern "C" { extern inline int __signbitf(float) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); }
+ extern "C" __attribute__((__weak__)) float fmaf(float, float, float) throw(); extern "C" float __fmaf(float, float, float) throw();
+ extern "C" float scalbf(float, float) throw(); extern "C" float __scalbf(float, float) throw();
+ extern "C" long double acosl(long double) throw(); extern "C" long double __acosl(long double) throw();
+@@ -2106,7 +2101,7 @@
+ extern "C" long double fmaxl(long double, long double) throw(); extern "C" long double __fmaxl(long double, long double) throw();
+ extern "C" long double fminl(long double, long double) throw(); extern "C" long double __fminl(long double, long double) throw();
+ extern "C" int __fpclassifyl(long double) throw() __attribute__((__const__));
+-extern "C" { extern inline __attribute__((__weak__)) int __signbitl(long double) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); }
++extern "C" { extern inline int __signbitl(long double) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); }
+ extern "C" long double fmal(long double, long double, long double) throw(); extern "C" long double __fmal(long double, long double, long double) throw();
+ extern "C" long double scalbl(long double, long double) throw(); extern "C" long double __scalbl(long double, long double) throw();
+ extern "C" { extern int signgam; }
+@@ -2134,19 +2129,19 @@
+ double retval;
+ }; }
+ extern "C" int matherr(__exception *) throw();
+-extern "C" { inline __attribute__((__weak__)) __attribute__((__gnu_inline__)) __attribute__((__const__)) int __signbitf(float __x) throw()
++extern "C" { inline __attribute__((__gnu_inline__)) __attribute__((__const__)) int __signbitf(float __x) throw()
+ {
+ int __m;
+ __asm__("pmovmskb %1, %0" : "=r" (__m) : "x" (__x));
+ return __m & 8;
+ } }
+-extern "C" { inline __attribute__((__weak__)) __attribute__((__gnu_inline__)) __attribute__((__const__)) int __signbit(double __x) throw()
++extern "C" { inline __attribute__((__gnu_inline__)) __attribute__((__const__)) int __signbit(double __x) throw()
+ {
+ int __m;
+ __asm__("pmovmskb %1, %0" : "=r" (__m) : "x" (__x));
+ return __m & 128;
+ } }
+-extern "C" { inline __attribute__((__weak__)) __attribute__((__gnu_inline__)) __attribute__((__const__)) int __signbitl(long double __x) throw()
++extern "C" { inline __attribute__((__gnu_inline__)) __attribute__((__const__)) int __signbitl(long double __x) throw()
+ {
+ union { long double __l; int __i[3]; } __u = {__l: __x};
+ return (((__u.__i)[2]) & 32768) != 0;
+@@ -9864,7 +9859,7 @@
+ {
+ __c_locale __old = __gnu_cxx::__uselocale(__cloc);
+ __builtin_va_list __args;
+-__builtin_stdarg_start(__args,__fmt);
++__builtin_va_start(__args,__fmt);
+ const int __ret = __builtin_vsnprintf(__out, __size, __fmt, __args);
+ __builtin_va_end(__args);
+ __gnu_cxx::__uselocale(__old);
+@@ -23186,7 +23186,7 @@
+ static T2 *Alloc(int s) { auto T2 *p = (reinterpret_cast< T2 *>(_mm_malloc(s * sizeof(CacheLineSizeHelper< T> ), 128))); return new (p) T2 [s]; }
+ static void Free(T2 *const p, int size) {
+ for (int i = 0; i < size; ++i) {
+-((p[i]).~CacheLineSizeHelper());
++((p[i]).~T2());
+ }
+ _mm_free(p);
+ } 
diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.h b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.h
new file mode 100755 (executable)
index 0000000..11e7d84
--- /dev/null
@@ -0,0 +1,191 @@
+//-*- Mode: C++ -*-
+// $Id$
+
+// ************************************************************************
+// This file is property of and copyright by the ALICE HLT Project        *
+// ALICE Experiment at CERN, All rights reserved.                         *
+// See cxx source for full Copyright notice                               *
+//                                                                        *
+//*************************************************************************
+
+//  @file   AliHLTTPCCAGPUTrackerNVCC.h
+//  @author David Rohr, Sergey Gorbunov
+//  @date   
+//  @brief  TPC CA Tracker for the NVIDIA GPU
+//  @note 
+
+
+#ifndef ALIHLTTPCCAGPUTRACKERNVCC_H
+#define ALIHLTTPCCAGPUTRACKERNVCC_H
+
+#include "AliHLTTPCCAGPUTracker.h"
+#include "AliHLTTPCCADef.h"
+#include "AliHLTTPCCATracker.h"
+#include "AliHLTLogging.h"
+#include "AliHLTTPCCASliceOutput.h"
+
+#ifdef __CINT__
+typedef int cudaError_t
+#elif defined(R__WIN32)
+#include "../cmodules/pthread_mutex_win32_wrapper.h"
+#else
+#include <pthread.h>
+#include <errno.h>
+#endif
+
+class AliHLTTPCCARow;
+
+class AliHLTTPCCAGPUTrackerNVCC : public AliHLTTPCCAGPUTracker, public AliHLTLogging
+{
+       friend void* helperWrapper(void*);
+public:
+       AliHLTTPCCAGPUTrackerNVCC();
+       virtual ~AliHLTTPCCAGPUTrackerNVCC();
+
+       virtual int InitGPU(int sliceCount = -1, int forceDeviceID = -1);
+       virtual int IsInitialized();
+       virtual int Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1);
+       int ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1);
+       int SelfHealReconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1);
+       virtual int ExitGPU();
+
+       virtual void SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile = NULL);
+       virtual int SetGPUTrackerOption(char* OptionName, int OptionValue);
+
+       virtual unsigned long long int* PerfTimer(int iSlice, unsigned int i);
+
+       virtual int InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param);
+       virtual void SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val);
+
+       virtual const AliHLTTPCCASliceOutput::outputControlStruct* OutputControl() const;
+       virtual int GetSliceCount() const;
+
+       virtual int RefitMergedTracks(AliHLTTPCGMMerger* Merger);
+       virtual char* MergerBaseMemory();
+
+private:
+       struct helperParam
+       {
+               void* fThreadId;
+               AliHLTTPCCAGPUTrackerNVCC* fCls;
+               int fNum;
+               int fSliceCount;
+               AliHLTTPCCAClusterData* pClusterData;
+               AliHLTTPCCASliceOutput** pOutput;
+               int fFirstSlice;
+               void* fMutex;
+               bool fTerminate;
+               int fPhase;
+               int CPUTracker;
+               volatile int fDone;
+               volatile bool fReset;
+       };
+
+       static void* RowMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + iSlice * sizeof(AliHLTTPCCARow) * (HLTCA_ROW_COUNT + 1) ); }
+       static void* CommonMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + iSlice * AliHLTTPCCATracker::CommonMemorySize() ); }
+       static void* SliceDataMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + iSlice * HLTCA_GPU_SLICE_DATA_MEMORY ); }
+       void* GlobalMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY) + iSlice * HLTCA_GPU_GLOBAL_MEMORY ); }
+       void* TracksMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY) + iSlice * HLTCA_GPU_TRACKS_MEMORY ); }
+       void* TrackerMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + iSlice * sizeof(AliHLTTPCCATracker) ); }
+       
+       void ReadEvent(AliHLTTPCCAClusterData* pClusterData, int firstSlice, int iSlice, int threadId);
+       void WriteOutput(AliHLTTPCCASliceOutput** pOutput, int firstSlice, int iSlice, int threadId);
+       int GlobalTracking(int iSlice, int threadId, helperParam* hParam);
+
+       int StartHelperThreads();
+       int StopHelperThreads();
+       void ResetHelperThreads(int helpers);
+       void ResetThisHelperThread(AliHLTTPCCAGPUTrackerNVCC::helperParam* par);
+
+       void DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check = true);
+       int GetThread();
+       void ReleaseGlobalLock(void* sem);
+       int CheckMemorySizes(int sliceCount);
+
+       int CUDASync(char* state = "UNKNOWN", int sliceLocal = 0, int slice = 0);
+       template <class T> T* alignPointer(T* ptr, int alignment);
+       void StandalonePerfTime(int iSlice, int i);
+#define CudaFailedMsg(x) CudaFailedMsgA(x, __FILE__, __LINE__)
+       bool CudaFailedMsgA(cudaError_t error, const char* file, int line);
+       
+       static void* helperWrapper(void*);
+
+       AliHLTTPCCATracker *fGpuTracker; //Tracker Objects that will be used on the GPU
+       void* fGPUMemory; //Pointer to GPU Memory Base Adress
+       void* fHostLockedMemory; //Pointer to Base Adress of Page Locked Host Memory for DMA Transfer
+
+       void* fGPUMergerMemory;
+       void* fGPUMergerHostMemory;
+       int fGPUMergerMaxMemory;
+
+       int fDebugLevel;                        //Debug Level for GPU Tracker
+       unsigned int fDebugMask;        //Mask which Debug Data is written to file
+       std::ostream* fOutFile;         //Debug Output Stream Pointer
+       unsigned long long int fGPUMemSize;     //Memory Size to allocate on GPU
+
+       void* fpCudaStreams; //Pointer to array of CUDA Streams
+       int fSliceCount; //Maximum Number of Slices this GPU tracker can process in parallel
+       int fCudaDevice; //CUDA device used by GPU tracker
+
+       static const int fgkNSlices = 36; //Number of Slices in Alice
+       AliHLTTPCCATracker fSlaveTrackers[fgkNSlices]; //CPU Slave Trackers for Initialization and Output
+
+       AliHLTTPCCASliceOutput::outputControlStruct* fOutputControl; //Output Control Structure
+       
+       int fThreadId; //Thread ID that is valid for the local CUDA context
+       int fCudaInitialized; //Flag if CUDA is initialized
+
+       int fPPMode; //Flag if GPU tracker runs in PP Mode
+       int fSelfheal; //Reinitialize GPU on failure
+
+       int fConstructorBlockCount; //GPU blocks used in Tracklet Constructor
+       int selectorBlockCount; //GPU blocks used in Tracklet Selector
+       
+#ifdef HLTCA_GPU_TIME_PROFILE
+       unsigned long long int fProfTimeC, fProfTimeD; //Timing
+#endif
+
+       void* fCudaContext; //Pointer to CUDA context
+       
+       int fNHelperThreads; //Number of helper threads for post/preprocessing
+       helperParam* fHelperParams; //Control Struct for helper threads
+       void* fHelperMemMutex;
+       
+#ifdef __ROOT__
+#define volatile
+#endif
+       volatile int fSliceOutputReady;
+       volatile char fSliceLeftGlobalReady[fgkNSlices];
+       volatile char fSliceRightGlobalReady[fgkNSlices];
+#ifdef __ROOT__
+#undef volatile
+#endif
+       void* fSliceGlobalMutexes;
+       char fGlobalTrackingDone[fgkNSlices];
+       char fWriteOutputDone[fgkNSlices];
+
+       int fNCPUTrackers; //Number of CPU trackers to use
+       int fNSlicesPerCPUTracker; //Number of slices processed by each CPU tracker
+
+       int fGlobalTracking; //Use Global Tracking
+       int fUseGlobalTracking; 
+
+       int fNSlaveThreads;     //Number of slave threads currently active
+
+       // disable copy
+       AliHLTTPCCAGPUTrackerNVCC( const AliHLTTPCCAGPUTrackerNVCC& );
+       AliHLTTPCCAGPUTrackerNVCC &operator=( const AliHLTTPCCAGPUTrackerNVCC& );
+
+       ClassDef( AliHLTTPCCAGPUTrackerNVCC, 0 )
+};
+
+#ifdef R__WIN32
+#define DLL_EXPORT __declspec(dllexport)
+#else
+#define DLL_EXPORT
+#endif
+
+extern "C" DLL_EXPORT AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate();
+extern "C" DLL_EXPORT void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr);
+
+#endif //ALIHLTTPCCAGPUTRACKER_H
diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCATrackletConstructorGPU.h b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCATrackletConstructorGPU.h
new file mode 100755 (executable)
index 0000000..ec0a550
--- /dev/null
@@ -0,0 +1,868 @@
+#include "AliHLTTPCCAGPUConfig.h"
+
+GPUdi() void AliHLTTPCCATrackletConstructor::CopyTrackletTempData( AliHLTTPCCAThreadMemory &rMemSrc, AliHLTTPCCAThreadMemory &rMemDst, AliHLTTPCCATrackParam &tParamSrc, AliHLTTPCCATrackParam &tParamDst)
+{
+       //Copy Temporary Tracklet data from registers to global mem and vice versa
+       rMemDst.fStartRow = rMemSrc.fStartRow;
+       rMemDst.fEndRow = rMemSrc.fEndRow;
+       rMemDst.fFirstRow = rMemSrc.fFirstRow;
+       rMemDst.fLastRow = rMemSrc.fLastRow;
+       rMemDst.fCurrIH =  rMemSrc.fCurrIH;
+       rMemDst.fGo = rMemSrc.fGo;
+       rMemDst.fStage = rMemSrc.fStage;
+       rMemDst.fNHits = rMemSrc.fNHits;
+       rMemDst.fNMissed = rMemSrc.fNMissed;
+       rMemDst.fLastY = rMemSrc.fLastY;
+       rMemDst.fLastZ = rMemSrc.fLastZ;
+
+#if defined(HLTCA_GPU_ALTERNATIVE_SCHEDULER) & !defined(HLTCA_GPU_ALTERNATIVE_SCHEDULER_SIMPLE)
+       rMemDst.fItr = rMemSrc.fItr;
+       rMemDst.fIRow = rMemSrc.fIRow;
+       rMemDst.fIRowEnd = rMemSrc.fIRowEnd;
+#endif
+
+       tParamDst.SetSinPhi( tParamSrc.GetSinPhi() );
+       tParamDst.SetDzDs( tParamSrc.GetDzDs() );
+       tParamDst.SetQPt( tParamSrc.GetQPt() );
+       tParamDst.SetSignCosPhi( tParamSrc.GetSignCosPhi() );
+       tParamDst.SetChi2( tParamSrc.GetChi2() );
+       tParamDst.SetNDF( tParamSrc.GetNDF() );
+       tParamDst.SetCov( 0, tParamSrc.GetCov(0) );
+       tParamDst.SetCov( 1, tParamSrc.GetCov(1) );
+       tParamDst.SetCov( 2, tParamSrc.GetCov(2) );
+       tParamDst.SetCov( 3, tParamSrc.GetCov(3) );
+       tParamDst.SetCov( 4, tParamSrc.GetCov(4) );
+       tParamDst.SetCov( 5, tParamSrc.GetCov(5) );
+       tParamDst.SetCov( 6, tParamSrc.GetCov(6) );
+       tParamDst.SetCov( 7, tParamSrc.GetCov(7) );
+       tParamDst.SetCov( 8, tParamSrc.GetCov(8) );
+       tParamDst.SetCov( 9, tParamSrc.GetCov(9) );
+       tParamDst.SetCov( 10, tParamSrc.GetCov(10) );
+       tParamDst.SetCov( 11, tParamSrc.GetCov(11) );
+       tParamDst.SetCov( 12, tParamSrc.GetCov(12) );
+       tParamDst.SetCov( 13, tParamSrc.GetCov(13) );
+       tParamDst.SetCov( 14, tParamSrc.GetCov(14) );
+       tParamDst.SetX( tParamSrc.GetX() );
+       tParamDst.SetY( tParamSrc.GetY() );
+       tParamDst.SetZ( tParamSrc.GetZ() );
+}
+
+#ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
+GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tracker, AliHLTTPCCASharedMemory &sMem, int Reverse, int RowBlock, int &mustInit)
+{
+       //Fetch a new trackled to be processed by this thread
+       __syncthreads();
+       int nextTrackletFirstRun = sMem.fNextTrackletFirstRun;
+       if (threadIdx.x == 0)
+       {
+               sMem.fNTracklets = *tracker.NTracklets();
+               if (sMem.fNextTrackletFirstRun)
+               {
+#ifdef HLTCA_GPU_SCHED_FIXED_START
+                       const int iSlice = tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + (gridDim.x % tracker.GPUParametersConst()->fGPUnSlices != 0 && tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + 1) % gridDim.x != 0)) / gridDim.x;
+                       const int nSliceBlockOffset = gridDim.x * iSlice / tracker.GPUParametersConst()->fGPUnSlices;
+                       const uint2 &nTracklet = tracker.BlockStartingTracklet()[blockIdx.x - nSliceBlockOffset];
+
+                       sMem.fNextTrackletCount = nTracklet.y;
+                       if (sMem.fNextTrackletCount == 0)
+                       {
+                               sMem.fNextTrackletFirstRun = 0;
+                       }
+                       else
+                       {
+                               if (tracker.TrackletStartHits()[nTracklet.x].RowIndex() / HLTCA_GPU_SCHED_ROW_STEP != RowBlock)
+                               {
+                                       sMem.fNextTrackletCount = 0;
+                               }
+                               else
+                               {
+                                       sMem.fNextTrackletFirst = nTracklet.x;
+                               }
+                       }
+#endif //HLTCA_GPU_SCHED_FIXED_START
+               }
+               else
+               {
+                       const int4 oldPos = *tracker.RowBlockPos(Reverse, RowBlock);
+                       const int nFetchTracks = CAMath::Max(CAMath::Min(oldPos.x - oldPos.y, HLTCA_GPU_THREAD_COUNT), 0);
+                       sMem.fNextTrackletCount = nFetchTracks;
+                       const int nUseTrack = nFetchTracks ? CAMath::AtomicAdd(&(*tracker.RowBlockPos(Reverse, RowBlock)).y, nFetchTracks) : 0;
+                       sMem.fNextTrackletFirst = nUseTrack;
+
+                       const int nFillTracks = CAMath::Min(nFetchTracks, nUseTrack + nFetchTracks - (*((volatile int2*) (tracker.RowBlockPos(Reverse, RowBlock)))).x);
+                       if (nFillTracks > 0)
+                       {
+                               const int nStartFillTrack = CAMath::AtomicAdd(&(*tracker.RowBlockPos(Reverse, RowBlock)).x, nFillTracks);
+                               if (nFillTracks + nStartFillTrack >= HLTCA_GPU_MAX_TRACKLETS)
+                               {
+                                       tracker.GPUParameters()->fGPUError = HLTCA_GPU_ERROR_ROWBLOCK_TRACKLET_OVERFLOW;
+                               }
+                               for (int i = 0;i < nFillTracks;i++)
+                               {
+                                       tracker.RowBlockTracklets(Reverse, RowBlock)[(nStartFillTrack + i) % HLTCA_GPU_MAX_TRACKLETS] = -(blockIdx.x * 1000000 + nFetchTracks * 10000 + oldPos.x * 100 + oldPos.y);     //Dummy filling track
+                               }
+                       }
+               }
+       }
+       __syncthreads();
+       mustInit = 0;
+       if (sMem.fNextTrackletCount == 0)
+       {
+               return(-2);             //No more track in this RowBlock
+       }
+       else if (threadIdx.x >= sMem.fNextTrackletCount)
+       {
+               return(-1);             //No track in this RowBlock for this thread
+       }
+       else if (nextTrackletFirstRun)
+       {
+               if (threadIdx.x == 0) sMem.fNextTrackletFirstRun = 0;
+               mustInit = 1;
+               return(sMem.fNextTrackletFirst + threadIdx.x);
+       }
+       else
+       {
+               const int nTrackPos = sMem.fNextTrackletFirst + threadIdx.x;
+               mustInit = (nTrackPos < tracker.RowBlockPos(Reverse, RowBlock)->w);
+               volatile int* const ptrTracklet = &tracker.RowBlockTracklets(Reverse, RowBlock)[nTrackPos % HLTCA_GPU_MAX_TRACKLETS];
+               int nTracklet;
+               int nTryCount = 0;
+               while ((nTracklet = *ptrTracklet) == -1)
+               {
+                       for (int i = 0;i < 20000;i++)
+                               sMem.fNextTrackletStupidDummy++;
+                       nTryCount++;
+                       if (nTryCount > 30)
+                       {
+                               tracker.GPUParameters()->fGPUError = HLTCA_GPU_ERROR_SCHEDULE_COLLISION;
+                               return(-1);
+                       }
+               };
+               return(nTracklet);
+       }
+}
+
+GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(AliHLTTPCCATracker *pTracker)
+{
+       //Main Tracklet construction function that calls the scheduled (FetchTracklet) and then Processes the tracklet (mainly UpdataTracklet) and at the end stores the tracklet.
+       //Can also dispatch a tracklet to be rescheduled
+#ifdef HLTCA_GPU_EMULATION_SINGLE_TRACKLET
+       pTracker[0].BlockStartingTracklet()[0].x = HLTCA_GPU_EMULATION_SINGLE_TRACKLET;
+       pTracker[0].BlockStartingTracklet()[0].y = 1;
+       for (int i = 1;i < gridDim.x;i++)
+       {
+               pTracker[0].BlockStartingTracklet()[i].x = pTracker[0].BlockStartingTracklet()[i].y = 0;
+       }
+#endif //HLTCA_GPU_EMULATION_SINGLE_TRACKLET
+
+       GPUshared() AliHLTTPCCASharedMemory sMem;
+
+#ifdef HLTCA_GPU_SCHED_FIXED_START
+       if (threadIdx.x == 0)
+       {
+               sMem.fNextTrackletFirstRun = 1;
+       }
+       __syncthreads();
+#endif //HLTCA_GPU_SCHED_FIXED_START
+
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+       if (threadIdx.x == 0)
+       {
+               sMem.fMaxSync = 0;
+       }
+       int threadSync = 0;
+#endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+
+       for (int iReverse = 0;iReverse < 2;iReverse++)
+       {
+               for (volatile int iRowBlock = 0;iRowBlock < HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1;iRowBlock++)
+               {
+#ifdef HLTCA_GPU_SCHED_FIXED_SLICE
+                       int iSlice = pTracker[0].GPUParametersConst()->fGPUnSlices * (blockIdx.x + (gridDim.x % pTracker[0].GPUParametersConst()->fGPUnSlices != 0 && pTracker[0].GPUParametersConst()->fGPUnSlices * (blockIdx.x + 1) % gridDim.x != 0)) / gridDim.x;
+#else
+                       for (int iSlice = 0;iSlice < pTracker[0].GPUParametersConst()->fGPUnSlices;iSlice++)
+#endif //HLTCA_GPU_SCHED_FIXED_SLICE
+                       {
+                               AliHLTTPCCATracker &tracker = pTracker[iSlice];
+                               if (blockIdx.x != 7 && sMem.fNextTrackletFirstRun && iSlice != (tracker.GPUParametersConst()->fGPUnSlices > gridDim.x ? blockIdx.x : (tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + (gridDim.x % tracker.GPUParametersConst()->fGPUnSlices != 0 && tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + 1) % gridDim.x != 0)) / gridDim.x)))
+                               {
+                                       continue;
+                               }
+
+                               int sharedRowsInitialized = 0;
+
+                               int iTracklet;
+                               int mustInit;
+                               while ((iTracklet = FetchTracklet(tracker, sMem, iReverse, iRowBlock, mustInit)) != -2)
+                               {
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+                                       CAMath::AtomicMax(&sMem.fMaxSync, threadSync);
+                                       __syncthreads();
+                                       threadSync = CAMath::Min(sMem.fMaxSync, 100000000 / blockDim.x / gridDim.x);
+#endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+                                       if (!sharedRowsInitialized)
+                                       {
+                                               for (int i = threadIdx.x;i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x)
+                                               {
+                                                       reinterpret_cast<int*>(&sMem.fRows)[i] = reinterpret_cast<int*>(tracker.SliceDataRows())[i];
+                                               }
+                                               sharedRowsInitialized = 1;
+                                       }
+#ifdef HLTCA_GPU_RESCHED
+                                       short2 storeToRowBlock;
+                                       int storePosition = 0;
+                                       if (threadIdx.x < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1))
+                                       {
+                                               const int nReverse = threadIdx.x / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+                                               const int nRowBlock = threadIdx.x % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+                                               sMem.fTrackletStoreCount[nReverse][nRowBlock] = 0;
+                                       }
+#else
+                                       mustInit = 1;
+#endif //HLTCA_GPU_RESCHED
+                                       __syncthreads();
+                                       AliHLTTPCCATrackParam tParam;
+                                       AliHLTTPCCAThreadMemory rMem;
+
+#ifdef HLTCA_GPU_EMULATION_DEBUG_TRACKLET
+                                       if (iTracklet == HLTCA_GPU_EMULATION_DEBUG_TRACKLET)
+                                       {
+                                               tracker.GPUParameters()->fGPUError = 1;
+                                       }
+#endif //HLTCA_GPU_EMULATION_DEBUG_TRACKLET
+                                       AliHLTTPCCAThreadMemory &rMemGlobal = tracker.GPUTrackletTemp()[iTracklet].fThreadMem;
+                                       AliHLTTPCCATrackParam &tParamGlobal = tracker.GPUTrackletTemp()[iTracklet].fParam;
+                                       if (mustInit)
+                                       {
+                                               AliHLTTPCCAHitId id = tracker.TrackletStartHits()[iTracklet];
+
+                                               rMem.fStartRow = rMem.fEndRow = rMem.fFirstRow = rMem.fLastRow = id.RowIndex();
+                                               rMem.fCurrIH = id.HitIndex();
+                                               rMem.fStage = 0;
+                                               rMem.fNHits = 0;
+                                               rMem.fNMissed = 0;
+
+                                               AliHLTTPCCATrackletConstructor::InitTracklet(tParam);
+                                       }
+                                       else if (iTracklet >= 0)
+                                       {
+                                               CopyTrackletTempData( rMemGlobal, rMem, tParamGlobal, tParam );
+                                       }
+                                       rMem.fItr = iTracklet;
+                                       rMem.fGo = (iTracklet >= 0);
+
+#ifdef HLTCA_GPU_RESCHED
+                                       storeToRowBlock.x = iRowBlock + 1;
+                                       storeToRowBlock.y = iReverse;
+                                       if (iReverse)
+                                       {
+                                               for (int j = HLTCA_ROW_COUNT - 1 - iRowBlock * HLTCA_GPU_SCHED_ROW_STEP;j >= CAMath::Max(0, HLTCA_ROW_COUNT - (iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP);j--)
+                                               {
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+                                                       if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && !(j >= rMem.fEndRow || ( j >= rMem.fStartRow && j - rMem.fStartRow % 2 == 0)))
+                                                               pTracker[0].StageAtSync()[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1;
+#endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+                                                       if (iTracklet >= 0)
+                                                       {
+                                                               UpdateTracklet(gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j);
+                                                               if (rMem.fNMissed > kMaxRowGap && j <= rMem.fStartRow)
+                                                               {
+                                                                       rMem.fGo = 0;
+                                                                       break;
+                                                               }
+                                                       }
+                                               }
+                                                       
+                                               if (iTracklet >= 0 && (!rMem.fGo || iRowBlock == HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP))
+                                               {
+                                                       StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam );
+                                               }
+                                       }
+                                       else
+                                       {
+                                               for (int j = CAMath::Max(1, iRowBlock * HLTCA_GPU_SCHED_ROW_STEP);j < CAMath::Min((iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP, HLTCA_ROW_COUNT);j++)
+                                               {
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+                                                       if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && j >= rMem.fStartRow && (rMem.fStage > 0 || rMem.fCurrIH >= 0 || (j - rMem.fStartRow) % 2 == 0 ))
+                                                               pTracker[0].StageAtSync()[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1;
+#endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+                                                       if (iTracklet >= 0)
+                                                       {
+                                                               UpdateTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j);
+                                                               //if (rMem.fNMissed > kMaxRowGap || rMem.fGo == 0) break;       //DR!!! CUDA Crashes with this enabled
+                                                       }
+                                               }
+                                               if (rMem.fGo && (rMem.fNMissed > kMaxRowGap || iRowBlock == HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP))
+                                               {
+                                                       if ( !tParam.TransportToX( sMem.fRows[ rMem.fEndRow ].X(), tracker.Param().ConstBz(), .999 ) )
+                                                       {
+                                                               rMem.fGo = 0;
+                                                       }
+                                                       else
+                                                       {
+                                                               storeToRowBlock.x = (HLTCA_ROW_COUNT - rMem.fEndRow) / HLTCA_GPU_SCHED_ROW_STEP;
+                                                               storeToRowBlock.y = 1;
+                                                               rMem.fNMissed = 0;
+                                                               rMem.fStage = 2;
+                                                       }
+                                               }
+
+                                               if (iTracklet >= 0 && !rMem.fGo)
+                                               {
+                                                       StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam );
+                                               }
+                                       }
+
+                                       if (rMem.fGo && (iRowBlock != HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP || iReverse == 0))
+                                       {
+                                               CopyTrackletTempData( rMem, rMemGlobal, tParam, tParamGlobal );
+                                               storePosition = CAMath::AtomicAdd(&sMem.fTrackletStoreCount[storeToRowBlock.y][storeToRowBlock.x], 1);
+                                       }
+
+                                       __syncthreads();
+                                       if (threadIdx.x < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1))
+                                       {
+                                               const int nReverse = threadIdx.x / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+                                               const int nRowBlock = threadIdx.x % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+                                               if (sMem.fTrackletStoreCount[nReverse][nRowBlock])
+                                               {
+                                                       sMem.fTrackletStoreCount[nReverse][nRowBlock] = CAMath::AtomicAdd(&tracker.RowBlockPos(nReverse, nRowBlock)->x, sMem.fTrackletStoreCount[nReverse][nRowBlock]);
+                                               }
+                                       }
+                                       __syncthreads();
+                                       if (iTracklet >= 0 && rMem.fGo && (iRowBlock != HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP || iReverse == 0))
+                                       {
+                                               tracker.RowBlockTracklets(storeToRowBlock.y, storeToRowBlock.x)[sMem.fTrackletStoreCount[storeToRowBlock.y][storeToRowBlock.x] + storePosition] = iTracklet;
+                                       }
+                                       __syncthreads();
+#else
+                                       if (threadIdx.x % HLTCA_GPU_WARP_SIZE == 0)
+                                       {
+                                               sMem.fStartRows[threadIdx.x / HLTCA_GPU_WARP_SIZE] = 160;
+                                               sMem.fEndRows[threadIdx.x / HLTCA_GPU_WARP_SIZE] = 0;
+                                       }
+                                       __syncthreads();
+                                       if (iTracklet >= 0)
+                                       {
+                                               CAMath::AtomicMin(&sMem.fStartRows[threadIdx.x / HLTCA_GPU_WARP_SIZE], rMem.fStartRow);
+                                       }
+                                       __syncthreads();
+                                       if (iTracklet >= 0)
+                                       {
+                                               for (int j = sMem.fStartRows[threadIdx.x / HLTCA_GPU_WARP_SIZE];j < HLTCA_ROW_COUNT;j++)
+                                               {
+                                                       UpdateTracklet(gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j);
+                                                       if (!rMem.fGo) break;
+                                               }
+
+                                               rMem.fNMissed = 0;
+                                               rMem.fStage = 2;
+                                               if ( rMem.fGo )
+                                               {
+                                                       if ( !tParam.TransportToX( tracker.Row( rMem.fEndRow ).X(), tracker.Param().ConstBz(), .999 ) )  rMem.fGo = 0;
+                                               }
+                                               CAMath::AtomicMax(&sMem.fEndRows[threadIdx.x / HLTCA_GPU_WARP_SIZE], rMem.fEndRow);
+                                       }
+
+                                       __syncthreads();
+                                       if (iTracklet >= 0)
+                                       {
+                                               for (int j = rMem.fEndRow;j >= 0;j--)
+                                               {
+                                                       if (!rMem.fGo) break;
+                                                       UpdateTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j);
+                                               }
+
+                                               StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam );
+                                       }
+#endif //HLTCA_GPU_RESCHED
+                               }
+                       }
+               }
+       }
+}
+
+GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorInit(int iTracklet, AliHLTTPCCATracker &tracker)
+{
+       //Initialize Row Blocks
+
+#ifndef HLTCA_GPU_EMULATION_SINGLE_TRACKLET
+AliHLTTPCCAHitId id = tracker.TrackletStartHits()[iTracklet];
+#ifdef HLTCA_GPU_SCHED_FIXED_START
+       const int firstDynamicTracklet = tracker.GPUParameters()->fScheduleFirstDynamicTracklet;
+       if (iTracklet >= firstDynamicTracklet)
+#endif //HLTCA_GPU_SCHED_FIXED_START
+       {
+#ifdef HLTCA_GPU_SCHED_FIXED_START
+               const int firstTrackletInRowBlock = CAMath::Max(firstDynamicTracklet, tracker.RowStartHitCountOffset()[CAMath::Max(id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP * HLTCA_GPU_SCHED_ROW_STEP, 1)].z);
+#else
+               const int firstTrackletInRowBlock = tracker.RowStartHitCountOffset()[CAMath::Max(id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP * HLTCA_GPU_SCHED_ROW_STEP, 1)].z;
+#endif //HLTCA_GPU_SCHED_FIXED_START
+
+               if (iTracklet == firstTrackletInRowBlock)
+               {
+                       const int firstRowInNextBlock = (id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_SCHED_ROW_STEP;
+                       int trackletsInRowBlock;
+                       if (firstRowInNextBlock >= HLTCA_ROW_COUNT - 3)
+                               trackletsInRowBlock = *tracker.NTracklets() - firstTrackletInRowBlock;
+                       else
+#ifdef HLTCA_GPU_SCHED_FIXED_START
+                               trackletsInRowBlock = CAMath::Max(firstDynamicTracklet, tracker.RowStartHitCountOffset()[firstRowInNextBlock].z) - firstTrackletInRowBlock;
+#else
+                               trackletsInRowBlock = tracker.RowStartHitCountOffset()[firstRowInNextBlock].z - firstTrackletInRowBlock;
+#endif //HLTCA_GPU_SCHED_FIXED_START
+
+                       tracker.RowBlockPos(0, id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP)->x = trackletsInRowBlock;
+                       tracker.RowBlockPos(0, id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP)->w = trackletsInRowBlock;
+               }
+               tracker.RowBlockTracklets(0, id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP)[iTracklet - firstTrackletInRowBlock] = iTracklet;
+       }
+#endif //!HLTCA_GPU_EMULATION_SINGLE_TRACKLET
+}
+
+GPUg() void AliHLTTPCCATrackletConstructorInit(int iSlice)
+{
+       //GPU Wrapper for AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorInit
+       AliHLTTPCCATracker &tracker = ( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker )[iSlice];
+       int i = blockIdx.x * blockDim.x + threadIdx.x;
+       if (i >= *tracker.NTracklets()) return;
+       AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorInit(i, tracker);
+}
+
+#elif defined(HLTCA_GPU_ALTERNATIVE_SCHEDULER_SIMPLE)
+
+GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tracker, AliHLTTPCCASharedMemory &sMem, AliHLTTPCCAThreadMemory& /*rMem*/, AliHLTTPCCATrackParam& /*tParam*/)
+{
+       const int nativeslice = blockIdx.x % tracker.GPUParametersConst()->fGPUnSlices;
+       const int nTracklets = *tracker.NTracklets();
+       __syncthreads();
+       if (sMem.fNextTrackletFirstRun == 1)
+       {
+               if (threadIdx.x == 0)
+               {
+                       sMem.fNextTrackletFirst = (blockIdx.x - nativeslice) / tracker.GPUParametersConst()->fGPUnSlices * HLTCA_GPU_THREAD_COUNT;
+                       sMem.fNextTrackletFirstRun = 0;
+               }
+       }
+       else
+       {
+               if (threadIdx.x == 0)
+               {
+                       if (tracker.GPUParameters()->fNextTracklet < nTracklets)
+                       {
+                               const int firstTracklet = CAMath::AtomicAdd(&tracker.GPUParameters()->fNextTracklet, HLTCA_GPU_THREAD_COUNT);
+                               if (firstTracklet < nTracklets) sMem.fNextTrackletFirst = firstTracklet;
+                               else sMem.fNextTrackletFirst = -2;
+                       }
+                       else
+                       {
+                               sMem.fNextTrackletFirst = -2;
+                       }
+               }
+       }
+       __syncthreads();
+       return (sMem.fNextTrackletFirst);
+}
+
+GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(AliHLTTPCCATracker *pTracker)
+{
+       const int nSlices = pTracker[0].GPUParametersConst()->fGPUnSlices;
+       const int nativeslice = blockIdx.x % nSlices;
+       GPUshared() AliHLTTPCCASharedMemory sMem;
+       int currentSlice = -1;
+
+       if (threadIdx.x)
+       {
+               sMem.fNextTrackletFirstRun = 1;
+       }
+
+       for (int iSlice = 0;iSlice < nSlices;iSlice++)
+       {
+               AliHLTTPCCATracker &tracker = pTracker[(nativeslice + iSlice) % nSlices];
+               int iRow, iRowEnd;
+
+               AliHLTTPCCATrackParam tParam;
+               AliHLTTPCCAThreadMemory rMem;
+
+               int tmpTracklet;
+               while ((tmpTracklet = FetchTracklet(tracker, sMem, rMem, tParam)) != -2)
+               {
+                       if (tmpTracklet >= 0)
+                       {
+                               rMem.fItr = tmpTracklet + threadIdx.x;
+                       }
+                       else
+                       {
+                               rMem.fItr = -1;
+                       }
+
+                       if (iSlice != currentSlice)
+                       {
+                               if (threadIdx.x == 0)
+                               {
+                                       sMem.fNTracklets = *tracker.NTracklets();
+                               }
+
+                               for (int i = threadIdx.x;i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x)
+                               {
+                                       reinterpret_cast<int*>(&sMem.fRows)[i] = reinterpret_cast<int*>(tracker.SliceDataRows())[i];
+                               }
+                               currentSlice = iSlice;
+                               __syncthreads();
+                       }
+
+                       if (rMem.fItr < sMem.fNTracklets)
+                       {
+                               AliHLTTPCCAHitId id = tracker.TrackletStartHits()[rMem.fItr];
+
+                               rMem.fStartRow = rMem.fEndRow = rMem.fFirstRow = rMem.fLastRow = id.RowIndex();
+                               rMem.fCurrIH = id.HitIndex();
+                               rMem.fStage = 0;
+                               rMem.fNHits = 0;
+                               rMem.fNMissed = 0;
+
+                               AliHLTTPCCATrackletConstructor::InitTracklet(tParam);
+
+                               rMem.fGo = 1;
+
+
+                               iRow = rMem.fStartRow;
+                               iRowEnd = tracker.Param().NRows();
+                       }
+                       else
+                       {
+                               rMem.fGo = 0;
+                               rMem.fStartRow = rMem.fEndRow = 0;
+                               iRow = iRowEnd = 0;
+                               rMem.fStage = 0;
+                       }
+
+                       for (int k = 0;k < 2;k++)
+                       {
+                               for (;iRow != iRowEnd;iRow += rMem.fStage == 2 ? -1 : 1)
+                               {
+                                       UpdateTracklet(0, 0, 0, 0, sMem, rMem, tracker, tParam, iRow);
+                               }
+
+                               if (rMem.fStage == 2)
+                               {
+                                       if (rMem.fItr < sMem.fNTracklets)
+                                       {
+                                               StoreTracklet( 0, 0, 0, 0, sMem, rMem, tracker, tParam );
+                                       }
+                               }
+                               else
+                               {
+                                       rMem.fNMissed = 0;
+                                       rMem.fStage = 2;
+                                       if (rMem.fGo) if (!tParam.TransportToX( tracker.Row( rMem.fEndRow ).X(), tracker.Param().ConstBz(), .999)) rMem.fGo = 0;
+                                       iRow = rMem.fEndRow;
+                                       iRowEnd = -1;
+                               }
+                       }
+               }
+       }
+}
+
+#else //HLTCA_GPU_ALTERNATIVE_SCHEDULER
+
+GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tracker, AliHLTTPCCASharedMemory &sMem, AliHLTTPCCAThreadMemory &rMem, AliHLTTPCCATrackParam &tParam)
+{
+       const int nativeslice = blockIdx.x % tracker.GPUParametersConst()->fGPUnSlices;
+       const int nTracklets = *tracker.NTracklets();
+       __syncthreads();
+       if (threadIdx.x == 0) sMem.fTrackletStorePos = 0;
+       int nStorePos = -1;
+       if (sMem.fNextTrackletFirstRun == 1)
+       {
+               if (threadIdx.x == 0)
+               {
+                       sMem.fNextTrackletFirst = (blockIdx.x - nativeslice) / tracker.GPUParametersConst()->fGPUnSlices * HLTCA_GPU_THREAD_COUNT;
+                       sMem.fNextTrackletFirstRun = 0;
+                       sMem.fNextTrackletCount = HLTCA_GPU_THREAD_COUNT;
+               }
+       }
+       else
+       {
+               if (sMem.fNextTrackletCount < HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_ALTSCHED_MIN_THREADS)
+               {
+                       if (threadIdx.x == 0)
+                       {
+                               sMem.fNextTrackletFirst = -1;
+                       }
+               }
+               else
+               {
+                       __syncthreads();
+                       if (rMem.fItr != -1)
+                       {
+                               nStorePos = CAMath::AtomicAdd(&sMem.fTrackletStorePos, 1);
+                               CopyTrackletTempData(rMem, sMem.swapMemory[nStorePos].fThreadMem, tParam, sMem.swapMemory[nStorePos].fParam);
+                               rMem.fItr = -1;
+                       }
+                       if (threadIdx.x == 0)
+                       {
+                               if (tracker.GPUParameters()->fNextTracklet >= nTracklets)
+                               {
+                                       sMem.fNextTrackletFirst = -1;
+                               }
+                               else
+                               {
+                                       const int firstTracklet = CAMath::AtomicAdd(&tracker.GPUParameters()->fNextTracklet, sMem.fNextTrackletCount);
+                                       if (firstTracklet >= nTracklets)
+                                       {
+                                               sMem.fNextTrackletFirst = -1;
+                                       }
+                                       else
+                                       {
+                                               sMem.fNextTrackletFirst = firstTracklet;
+                                       }
+                               }
+                       }
+               }
+       }
+
+       if (threadIdx.x == 0)
+       {
+               if (sMem.fNextTrackletFirst == -1 && sMem.fNextTrackletCount == HLTCA_GPU_THREAD_COUNT)
+               {
+                       sMem.fNextTrackletFirst = -2;
+                       sMem.fNextTrackletCount = HLTCA_GPU_THREAD_COUNT;
+               }
+               else if (sMem.fNextTrackletFirst >= 0)
+               {
+                       if (sMem.fNextTrackletFirst + sMem.fNextTrackletCount >= nTracklets)
+                       {
+                               sMem.fNextTrackletCount = sMem.fNextTrackletFirst + sMem.fNextTrackletCount - nTracklets;
+                       }
+                       else
+                       {
+                               sMem.fNextTrackletCount = 0;
+                       }
+               }
+       }
+       __syncthreads();
+       if (threadIdx.x < sMem.fTrackletStorePos)
+       {
+               CopyTrackletTempData(sMem.swapMemory[threadIdx.x].fThreadMem, rMem, sMem.swapMemory[threadIdx.x].fParam, tParam);
+       }
+       return (sMem.fNextTrackletFirst);
+}
+
+GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(AliHLTTPCCATracker *pTracker)
+{
+       const int nSlices = pTracker[0].GPUParametersConst()->fGPUnSlices;
+       const int nativeslice = blockIdx.x % nSlices;
+       GPUshared() AliHLTTPCCASharedMemory sMem;
+       int currentSlice = -1;
+
+       if (threadIdx.x)
+       {
+               sMem.fNextTrackletFirstRun = 1;
+       }
+
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+       if (threadIdx.x == 0)
+       {
+               sMem.fMaxSync = 0;
+       }
+       int threadSync = 0;
+#endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+
+       for (int iSlice = 0;iSlice < nSlices;iSlice++)
+       {
+               AliHLTTPCCATracker &tracker = pTracker[(nativeslice + iSlice) % nSlices];
+
+               AliHLTTPCCATrackParam tParam;
+               AliHLTTPCCAThreadMemory rMem;
+               rMem.fItr = -1;
+
+               int tmpTracklet;
+               while ((tmpTracklet = FetchTracklet(tracker, sMem, rMem, tParam)) != -2)
+               {
+
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+                                       CAMath::AtomicMax(&sMem.fMaxSync, threadSync);
+                                       __syncthreads();
+                                       threadSync = CAMath::Min(sMem.fMaxSync, 100000000 / blockDim.x / gridDim.x);
+#endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+
+                       if (iSlice != currentSlice)
+                       {
+                               if (threadIdx.x == 0) sMem.fNTracklets = *tracker.NTracklets();
+
+                               for (int i = threadIdx.x;i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x)
+                               {
+                                       reinterpret_cast<int*>(&sMem.fRows)[i] = reinterpret_cast<int*>(tracker.SliceDataRows())[i];
+                               }
+                               currentSlice = iSlice;
+                               __syncthreads();
+                       }
+
+                       if (tmpTracklet >= 0 && rMem.fItr < 0)
+                       {
+                               rMem.fItr = tmpTracklet + (signed) threadIdx.x - sMem.fTrackletStorePos;
+                               if (rMem.fItr >= sMem.fNTracklets)
+                               {
+                                       rMem.fItr = -1;
+                               }
+                               else
+                               {
+                                       AliHLTTPCCAHitId id = tracker.TrackletStartHits()[rMem.fItr];
+
+                                       rMem.fStartRow = rMem.fEndRow = rMem.fFirstRow = rMem.fLastRow = id.RowIndex();
+                                       rMem.fCurrIH = id.HitIndex();
+                                       rMem.fStage = 0;
+                                       rMem.fNHits = 0;
+                                       rMem.fNMissed = 0;
+
+                                       AliHLTTPCCATrackletConstructor::InitTracklet(tParam);
+
+                                       rMem.fGo = 1;
+
+                                       rMem.fIRow = rMem.fStartRow;
+                                       rMem.fIRowEnd = tracker.Param().NRows();
+                               }
+                       }
+
+                       if (rMem.fItr >= 0)
+                       {
+                               for (int j = 0;j < HLTCA_GPU_ALTSCHED_STEPSIZE && rMem.fIRow != rMem.fIRowEnd;j++,rMem.fIRow += rMem.fStage == 2 ? -1 : 1)
+                               {
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+                                       if (rMem.fStage == 2)
+                                       {
+                                               if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && !(rMem.fIRow >= rMem.fEndRow || ( rMem.fIRow >= rMem.fStartRow && rMem.fIRow - rMem.fStartRow % 2 == 0)))
+                                                       pTracker[0].StageAtSync()[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1;
+                                       }
+                                       else
+                                       {
+                                               if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && rMem.fIRow >= rMem.fStartRow && (rMem.fStage > 0 || rMem.fCurrIH >= 0 || (rMem.fIRow - rMem.fStartRow) % 2 == 0 ))
+                                                       pTracker[0].StageAtSync()[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1;
+                                       }
+#endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+                                       UpdateTracklet(gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, rMem.fIRow);
+                               }
+
+                               if (rMem.fIRow == rMem.fIRowEnd || rMem.fNMissed > kMaxRowGap)
+                               {
+                                       if (rMem.fStage >= 2)
+                                       {
+                                               rMem.fGo = 0;
+                                       }
+                                       else if (rMem.fGo)
+                                       {
+                                               rMem.fNMissed = 0;
+                                               rMem.fStage = 2;
+                                               if (!tParam.TransportToX( tracker.Row( rMem.fEndRow ).X(), tracker.Param().ConstBz(), .999)) rMem.fGo = 0;
+                                               rMem.fIRow = rMem.fEndRow;
+                                               rMem.fIRowEnd = -1;
+                                       }
+                               }
+
+                               if (!rMem.fGo)
+                               {
+                                       StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam );
+                                       rMem.fItr = -1;
+                                       CAMath::AtomicAdd(&sMem.fNextTrackletCount, 1);
+                               }
+                       }
+               }
+       }
+}
+
+#endif //HLTCA_GPU_ALTERNATIVE_SCHEDULER
+
+GPUg() void AliHLTTPCCATrackletConstructorGPU()
+{
+       //GPU Wrapper for AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU
+       AliHLTTPCCATracker *pTracker = ( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker );
+       AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(pTracker);
+}
+
+GPUg() void AliHLTTPCCATrackletConstructorGPUPP(int firstSlice, int sliceCount)
+{
+       if (blockIdx.x >= sliceCount) return;
+       AliHLTTPCCATracker *pTracker = &( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker )[firstSlice + blockIdx.x];
+       AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPUPP(pTracker);
+}
+
+GPUd() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPUPP(AliHLTTPCCATracker *tracker)
+{
+       GPUshared() AliHLTTPCCASharedMemory sMem;
+#if defined(HLTCA_GPU_RESCHED) & !defined(HLTCA_GPU_ALTERNATIVE_SCHEDULER)
+#define startRows sMem.fStartRows
+#define endRows sMem.fEndRows
+#else
+       GPUshared() int startRows[HLTCA_GPU_THREAD_COUNT / HLTCA_GPU_WARP_SIZE + 1];
+       GPUshared() int endRows[HLTCA_GPU_THREAD_COUNT / HLTCA_GPU_WARP_SIZE + 1];
+#endif
+       sMem.fNTracklets = *tracker->NTracklets();
+
+       for (int i = threadIdx.x;i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x)
+       {
+               reinterpret_cast<int*>(&sMem.fRows)[i] = reinterpret_cast<int*>(tracker->SliceDataRows())[i];
+       }
+
+       for (int iTracklet = threadIdx.x;iTracklet < (*tracker->NTracklets() / HLTCA_GPU_THREAD_COUNT + 1) * HLTCA_GPU_THREAD_COUNT;iTracklet += blockDim.x)
+       {
+               AliHLTTPCCATrackParam tParam;
+               AliHLTTPCCAThreadMemory rMem;
+               
+               if (iTracklet < *tracker->NTracklets())
+               {
+                       AliHLTTPCCAHitId id = tracker->TrackletTmpStartHits()[iTracklet];
+
+                       rMem.fStartRow = rMem.fEndRow = rMem.fFirstRow = rMem.fLastRow = id.RowIndex();
+                       rMem.fCurrIH = id.HitIndex();
+                       rMem.fStage = 0;
+                       rMem.fNHits = 0;
+                       rMem.fNMissed = 0;
+
+                       AliHLTTPCCATrackletConstructor::InitTracklet(tParam);
+
+                       rMem.fItr = iTracklet;
+                       rMem.fGo = 1;
+               }
+
+               if (threadIdx.x % HLTCA_GPU_WARP_SIZE == 0)
+               {
+                       startRows[threadIdx.x / HLTCA_GPU_WARP_SIZE] = 160;
+                       endRows[threadIdx.x / HLTCA_GPU_WARP_SIZE] = 0;
+               }
+               __syncthreads();
+               if (iTracklet < *tracker->NTracklets())
+               {
+                       CAMath::AtomicMin(&startRows[threadIdx.x / HLTCA_GPU_WARP_SIZE], rMem.fStartRow);
+               }
+               __syncthreads();
+               if (iTracklet < *tracker->NTracklets())
+               {
+                       for (int j = startRows[threadIdx.x / HLTCA_GPU_WARP_SIZE];j < HLTCA_ROW_COUNT;j++)
+                       {
+                               UpdateTracklet(gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, *tracker, tParam, j);
+                               if (!rMem.fGo) break;
+                       }
+
+                       rMem.fNMissed = 0;
+                       rMem.fStage = 2;
+                       if ( rMem.fGo )
+                       {
+                               if ( !tParam.TransportToX( tracker->Row( rMem.fEndRow ).X(), tracker->Param().ConstBz(), .999 ) )  rMem.fGo = 0;
+                       }
+                       CAMath::AtomicMax(&endRows[threadIdx.x / HLTCA_GPU_WARP_SIZE], rMem.fEndRow);
+               }
+
+               __syncthreads();
+               if (iTracklet < *tracker->NTracklets())
+               {
+                       for (int j = rMem.fEndRow;j >= 0;j--)
+                       {
+                               if (!rMem.fGo) break;
+                               UpdateTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, *tracker, tParam, j);
+                       }
+                       StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, *tracker, tParam );
+               }
+       }
+}
\ No newline at end of file
diff --git a/HLT/TPCLib/tracking-ca/cagpu/gpu_tracker_install.txt b/HLT/TPCLib/tracking-ca/cagpu/gpu_tracker_install.txt
new file mode 100755 (executable)
index 0000000..83ea419
--- /dev/null
@@ -0,0 +1,23 @@
+The GPU Tracker is contained in an extra library AliHLTTPCCAGPU that must be build in addition to the HLT libraries\r
+The HLT Tracker Framework will automatically try to load AliHLTTPCCAGPU from LD_LIBRARY_PATH\r
+AliHLTTPCCAGPU will further try to load the CUDA runtime library which needs to be installed and pointed to by LD_LIBRARY_PATH\r
+If the -allowGPU option for the AliHLTTPCCATrackerComponent is set, the tracker framework will automatically try run the GPU tracker.\r
+If the GPU tracker is not available (because no GPU is present, or because all the GPU memory is used by other processes) the framework will automatically fall back to the CPU tracker\r
+\r
+Therefore to use the GPU tracker the following steps are needed:\r
+- The NVIDIA Driver must be installed and the kernel modules loaded\r
+- The CUDA Toolkit must be installed in /usr/local/cuda and the LD_LIBRARY_PATH must point to it\r
+- The library AliHLTTPCCAGPU must be compiled and LD_LIBRARY_PATH point to it\r
+- the -allowGPU option must be set for the TrackerComponent in the HLT configuration\r
+\r
+How to build the library AliHLTTPCCAGPU:\r
+The library is not build automatically by the AliROOT or HLT standalone build\r
+It can be found at via svn at https://qon.zapto.org/var/svn/catracker/catracker/standalone/cagpubuild\r
+The Variables ROOTSYS AND ALIHLT_TOPDIR must be set for the makefile to work, the cuda compiler nvcc must be present in the PATH variable\r
+The makefile builds the library and stores the shared object file in the current directory\r
+\r
+To build the library the following is needed\r
+- Checkout the library from svn\r
+- Make sure the environment variables ALIHLT_TOPDIR and ROOTSYS are set\r
+- The CUDA compiler must be installed in /usr/local/cuda and available in the PATH variable (you can change the path in the makefile)\r
+- run make\r
diff --git a/HLT/TPCLib/tracking-ca/cagpu/makefile b/HLT/TPCLib/tracking-ca/cagpu/makefile
new file mode 100755 (executable)
index 0000000..447942a
--- /dev/null
@@ -0,0 +1,36 @@
+all:                                                                   libAliHLTTPCCAGPU.so
+
+clean:
+                                                                               rm -f libAliHLTTPCCAGPU.so AliHLTTPCCAGPUTrackerNVCC.o G__AliHLTTPCCAGPU.o AliHLTTPCCAGPUTrackerNVCC.cu.tmp.cxx AliHLTTPCCAGPUTrackerNVCC.cu.cxx G__AliHLTTPCCAGPUAutoLinkDef.h G__AliHLTTPCCAGPU.h G__AliHLTTPCCAGPU.cxx
+
+
+libAliHLTTPCCAGPU.so:                                  AliHLTTPCCAGPUTrackerNVCC.o G__AliHLTTPCCAGPU.o
+                                                                               c++ -shared AliHLTTPCCAGPUTrackerNVCC.o G__AliHLTTPCCAGPU.o -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L${ALICE_ROOT}/lib/tgt_${ALICE_TARGET} -L. -lcuda -lcudart -lAliHLTTPC -o libAliHLTTPCCAGPU.so 
+
+
+AliHLTTPCCAGPUTrackerNVCC.o:                   AliHLTTPCCAGPUTrackerNVCC.cu.cxx
+                                                                               c++ -fPIC -DPACKAGE_TARNAME=\"alice-hlt\" -DPACKAGE_VERSION=\"35631\" -DPACKAGE_BUGREPORT=\"Matthias.Richter@ift.uib.no\" -DPACKAGE=\"alice-hlt\" -DVERSION=\"35631\" -DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_DLFCN_H=1 -DLT_OBJDIR=\".libs/\" -DNDEBUG=1 -Duse_aliroot=1 -Duse_root=1 -DHAVE_HOMERREADER=1 -DHLT_SAMPLE=1 -DHLT_UTIL=1 -DHAVE_ALITPCRAWSTREAM_H=1 -DHLT_TPC=1 -DHAVE_NOT_TPCOFFLINE_REC=1 -DHAVE_TPC_MAPPING=1 -DHAVE_ALIALTRODECODER_H=1 -DHLT_RCU=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_CALO=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_PHOS=1 -DHLT_EMCAL=1 -DHLT_TRD=1 -DHLT_FMD=1 -DHAVE_ALIMPEXMAP_H=1 -DHAVE_ALIMUONTRIGGERIO_H=1 -DHLT_MUON=1 -DHLT_TRIGGER=1 -DHLT_GLOBAL=1 -DHLT_JET=1 -DHAVE_ALIITSCOMPRESSRAWDATASDD_H=1 -DHLT_ITS=1 -DHLT_COMP=1 -DMODULE=AliHLTTPC -W -Weffc++ -Wall -Wshadow -DROOTVERSION=\"5.25/02\" -DALIROOTVERSION=\"Unknown\" -O2 -DBUILD_GPU -c AliHLTTPCCAGPUTrackerNVCC.cu.cxx -o AliHLTTPCCAGPUTrackerNVCC.o
+
+G__AliHLTTPCCAGPU.cxx:                                 G__AliHLTTPCCAGPUAtoLinkDef.h
+                                                                               rootcint -f G__AliHLTTPCCAGPU.cxx -c -Duse_aliroot -Duse_root -DROWHOUGHPARAMS -Duse_reconstruction -Duse_newio -DROOTVERSION=\"unchecked\" -DALIROOTVERSION=\"unchecked\" -D__ROOT__ -DUSE_ALILOG -DLINUX -DNDEBUG -D_MODULE_=\"HLT\" -D`uname` -DDATE_SYS=`uname` -Dlong32='int' -Dlong64='long long' -DdatePointer='long' -I${ROOTSYS}/include -pthread -m64 -DWITHXML -DWITHXML -DUSE_ROOT -DWITHXML -I${ALICE_ROOT}/HLT/BASE -I${ALICE_ROOT}/HLT/BASE/util -I${ALICE_ROOT}/HLT -I${ALICE_ROOT}/HLT/TPCLib -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca AliHLTTPCCAGPUTrackerNVCC.h G__AliHLTTPCCAGPUAutoLinkDef.h
+
+G__AliHLTTPCCAGPUAtoLinkDef.h:                 AliHLTTPCCAGPUTrackerNVCC.h AliHLTTPCCAGPUTrackerNVCC.cu
+                                                                               echo '//automatically generated ROOT DICT definition' > G__AliHLTTPCCAGPUAutoLinkDef.h
+                                                                               echo '//!!! DO NOT EDIT THIS FILE !!!' >> G__AliHLTTPCCAGPUAutoLinkDef.h
+                                                                               echo '#ifdef __CINT__' >> G__AliHLTTPCCAGPUAutoLinkDef.h
+                                                                               echo '#pragma link off all globals;' >> G__AliHLTTPCCAGPUAutoLinkDef.h
+                                                                               echo '#pragma link off all classes;' >> G__AliHLTTPCCAGPUAutoLinkDef.h
+                                                                               echo '#pragma link off all functions;' >> G__AliHLTTPCCAGPUAutoLinkDef.h
+                                                                               echo "#pragma link C++ class AliHLTTPCCAGPUTrackerNVCC+;" >> G__AliHLTTPCCAGPUAutoLinkDef.h
+                                                                               echo '#endif' >> G__AliHLTTPCCAGPUAutoLinkDef.h
+
+G__AliHLTTPCCAGPU.o:                                   G__AliHLTTPCCAGPU.cxx
+                                                                               g++ -DcudaError_t=int -Duse_aliroot -Duse_root -DROWHOUGHPARAMS -Duse_reconstruction -Duse_newio -DROOTVERSION=\"unchecked\" -DALIROOTVERSION=\"unchecked\" -D__ROOT__ -DUSE_ALILOG -DLINUX -DNDEBUG -DBUILD_GPU -D_MODULE_=\"HLT\" -I${ALICE_ROOT}/HLT/TPCLib -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca -I${ALICE_ROOT}/HLT/BASE -c G__AliHLTTPCCAGPU.cxx -o G__AliHLTTPCCAGPU.o -O -g -W -Wall -Weffc++ -fPIC -pipe -fmessage-length=0 -Wno-long-long -ansi -Dlinux -D`uname` -DDATE_SYS=`uname` -Dlong32='int' -Dlong64='long long' -DdatePointer='long' -I${ROOTSYS}/include -pthread -m64 -D__PHOSUTIL__ -D__EMCALUTIL__
+
+AliHLTTPCCAGPUTrackerNVCC.cu.cxx:              AliHLTTPCCAGPUTrackerNVCC.cu.tmp.cxx
+                                                                               cat AliHLTTPCCAGPUTrackerNVCC.cu.tmp.cxx | grep -v "^#" > AliHLTTPCCAGPUTrackerNVCC.cu.cxx
+                                                                               -patch -r /dev/null -s --no-backup-if-mismatch -i AliHLTTPCCAGPUTrackerNVCC.cu.x86_64-pc-linux-gnu.patch AliHLTTPCCAGPUTrackerNVCC.cu.cxx
+
+AliHLTTPCCAGPUTrackerNVCC.cu.tmp.cxx:  AliHLTTPCCAGPUTrackerNVCC.cu
+                                                                               nvcc --cuda --use_fast_math --maxrregcount 64 -O4 -Xptxas -v -Xptxas -O4 -gencode arch=compute_20,code=sm_20 --compiler-options "-DPACKAGE_TARNAME=\"alice-hlt\" -DPACKAGE_VERSION=\"35631\" -DPACKAGE_BUGREPORT=\"Matthias.Richter@ift.uib.no\" -DPACKAGE=\"alice-hlt\" -DVERSION=\"35631\" -DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_DLFCN_H=1 -DLT_OBJDIR=\".libs/\" -DNDEBUG=1 -Duse_aliroot=1 -Duse_root=1 -DHAVE_HOMERREADER=1 -DHLT_SAMPLE=1 -DHLT_UTIL=1 -DHAVE_ALITPCRAWSTREAM_H=1 -DHLT_TPC=1 -DHAVE_NOT_TPCOFFLINE_REC=1 -DHAVE_TPC_MAPPING=1 -DHAVE_ALIALTRODECODER_H=1 -DHLT_RCU=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_CALO=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_PHOS=1 -DHLT_EMCAL=1 -DHLT_TRD=1 -DHLT_FMD=1 -DHAVE_ALIMPEXMAP_H=1 -DHAVE_ALIMUONTRIGGERIO_H=1 -DHLT_MUON=1 -DHLT_TRIGGER=1 -DHLT_GLOBAL=1 -DHLT_JET=1 -DHAVE_ALIITSCOMPRESSRAWDATASDD_H=1 -DHLT_ITS=1 -DHLT_COMP=1 -DMODULE=AliHLTTPC -IRCU -W -Weffc++ -Wall -Wshadow -DROOTVERSION=\"5.25/02\" -DALIROOTVERSION=\"Unknown\" -O2 -DBUILD_GPU -I${ALICE_ROOT}/HLT/BASE -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca -I${ROOTSYS}/include" -I. AliHLTTPCCAGPUTrackerNVCC.cu --output-file AliHLTTPCCAGPUTrackerNVCC.cu.tmp.cxx
+