bug fix: reconstruction crash when the output buffer size exceed

[u/mrichter/AliRoot.git] / HLT / TPCLib / tracking-ca / AliHLTTPCCAGPUTrackerNVCC.cu
diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cu b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cu

index 6475ffafb5ebde713053fe159a09249e00d615d9..0713bec1acf9626a638d6a1642f5c7f2f03bfe47 100644 (file)
--- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cu
+++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cu
@@ -17,15 +17,24 @@
  //                                                                          *
  //***************************************************************************
  
-#include "AliHLTTPCCAGPUTracker.h"
+#include "AliHLTTPCCAGPUTrackerNVCC.h"
  
-#ifdef BUILD_GPU
+#ifdef HLTCA_GPUCODE
+#include <cuda.h>
+#include <sm_11_atomic_functions.h>
+#include <sm_12_atomic_functions.h>
+#endif
+
+#ifdef R__WIN32
+#else
+#include <sys/syscall.h>
+#include <semaphore.h>
+#include <fcntl.h>
+#endif
  
  #include "AliHLTTPCCADef.h"
  #include "AliHLTTPCCAGPUConfig.h"
  
-#include <sm_11_atomic_functions.h>
-#include <sm_12_atomic_functions.h>
  
  #include <iostream>
  
@@ -42,26 +51,20 @@ texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;
  texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
  #endif
  
-#include "AliHLTTPCCAHit.h"
-
  //Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code
  #include "AliHLTTPCCATrackParam.cxx"
  #include "AliHLTTPCCATrack.cxx" 
  
-#include "AliHLTTPCCATrackletSelector.cxx"
-
  #include "AliHLTTPCCAHitArea.cxx"
  #include "AliHLTTPCCAGrid.cxx"
  #include "AliHLTTPCCARow.cxx"
  #include "AliHLTTPCCAParam.cxx"
  #include "AliHLTTPCCATracker.cxx"
  
-#include "AliHLTTPCCAOutTrack.cxx"
-
  #include "AliHLTTPCCAProcess.h"
  
+#include "AliHLTTPCCATrackletSelector.cxx"
  #include "AliHLTTPCCANeighboursFinder.cxx"
-
  #include "AliHLTTPCCANeighboursCleaner.cxx"
  #include "AliHLTTPCCAStartHitsFinder.cxx"
  #include "AliHLTTPCCAStartHitsSorter.cxx"
@@ -75,45 +78,142 @@ texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
  #include "AliHLTSystem.h"
  #endif
  
-ClassImp( AliHLTTPCCAGPUTracker )
+ClassImp( AliHLTTPCCAGPUTrackerNVCC )
+
+bool AliHLTTPCCAGPUTrackerNVCC::fgGPUUsed = false;
+
+#define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem"
+
+AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() :
+       fGpuTracker(NULL),
+       fGPUMemory(NULL),
+       fHostLockedMemory(NULL),
+       fDebugLevel(0),
+       fOutFile(NULL),
+       fGPUMemSize(0),
+       fpCudaStreams(NULL),
+       fSliceCount(0),
+       fOutputControl(NULL),
+       fThreadId(0),
+       fCudaInitialized(0)
+       {};
+
+AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC() {};
+
+void AliHLTTPCCAGPUTrackerNVCC::ReleaseGlobalLock(void* sem)
+{
+       //Release the global named semaphore that locks GPU Initialization
+#ifdef R__WIN32
+       HANDLE* h = (HANDLE*) sem;
+       ReleaseSemaphore(*h, 1, NULL);
+       CloseHandle(*h);
+       delete h;
+#else
+       sem_t* pSem = (sem_t*) sem;
+       sem_post(pSem);
+       sem_unlink(SemLockName);
+#endif
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::CheckMemorySizes(int sliceCount)
+{
+       //Check constants for correct memory sizes
+  if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
+  {
+         HLTError("Insufficiant Tracker Object Memory");
+         return(1);
+  }
+
+  if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
+  {
+         HLTError("Insufficiant Common Memory");
+         return(1);
+  }
  
-bool AliHLTTPCCAGPUTracker::fgGPUUsed = false;
+  if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
+  {
+         HLTError("Insufficiant Row Memory");
+         return(1);
+  }
+  return(0);
+}
  
-int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
+int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)
  {
         //Find best CUDA device, initialize and allocate memory
-       
+
+       if (CheckMemorySizes(sliceCount)) return(1);
+
+#ifdef R__WIN32
+       HANDLE* semLock = new HANDLE;
+       *semLock = CreateSemaphore(NULL, 1, 1, SemLockName);
+       if (*semLock == NULL)
+       {
+               HLTError("Error creating GPUInit Semaphore");
+               return(1);
+       }
+       WaitForSingleObject(*semLock, INFINITE);
+#else
+       sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1);
+       if (semLock == SEM_FAILED)
+       {
+               HLTError("Error creating GPUInit Semaphore");
+               return(1);
+       }
+       sem_wait(semLock);
+#endif
+
         if (fgGPUUsed)
         {
             HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in same process");
+               ReleaseGlobalLock(semLock);
             return(1);
         }
+       fgGPUUsed = 1;
+       fThreadId = GetThread();
  
         cudaDeviceProp fCudaDeviceProp;
  
+       fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
+
  #ifndef CUDA_DEVICE_EMULATION
-       int count, bestDevice = -1, bestDeviceSpeed = 0;
+       int count, bestDevice = -1;
+       long long int bestDeviceSpeed = 0, deviceSpeed;
         if (CudaFailedMsg(cudaGetDeviceCount(&count)))
         {
                 HLTError("Error getting CUDA Device Count");
+               fgGPUUsed = 0;
+               ReleaseGlobalLock(semLock);
                 return(1);
         }
-       if (fDebugLevel >= 2) std::cout << "Available CUDA devices: ";
+       if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");
         for (int i = 0;i < count;i++)
         {
-               cudaGetDeviceProperties(&fCudaDeviceProp, i);
-               if (fDebugLevel >= 2) std::cout << fCudaDeviceProp.name << " (" << i << ")     ";
-               if (fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && fCudaDeviceProp.multiProcessorCount * fCudaDeviceProp.clockRate > bestDeviceSpeed)
+               unsigned int free, total;
+               cuInit(0);
+               CUdevice tmpDevice;
+               cuDeviceGet(&tmpDevice, i);
+               CUcontext tmpContext;
+               cuCtxCreate(&tmpContext, 0, tmpDevice);
+               if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";
+               cuCtxDestroy(tmpContext);
+               CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i));
+
+               int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && free >= fGPUMemSize;
+
+               if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %d / %lld)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, free, (long long int) fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");
+               deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free;
+               if (deviceOK && deviceSpeed > bestDeviceSpeed)
                 {
                         bestDevice = i;
-                       bestDeviceSpeed = fCudaDeviceProp.multiProcessorCount * fCudaDeviceProp.clockRate;
+                       bestDeviceSpeed = deviceSpeed;
                 }
         }
-       if (fDebugLevel >= 2) std::cout << std::endl;
-
         if (bestDevice == -1)
         {
                 HLTWarning("No CUDA Device available, aborting CUDA Initialisation");
+               fgGPUUsed = 0;
+               ReleaseGlobalLock(semLock);
                 return(1);
         }
  
@@ -130,56 +230,47 @@ int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
  
    if (fDebugLevel >= 1)
    {
-         std::cout<<"CUDA Device Properties: "<<std::endl;
-         std::cout<<"name = "<<fCudaDeviceProp.name<<std::endl;
-         std::cout<<"totalGlobalMem = "<<fCudaDeviceProp.totalGlobalMem<<std::endl;
-         std::cout<<"sharedMemPerBlock = "<<fCudaDeviceProp.sharedMemPerBlock<<std::endl;
-         std::cout<<"regsPerBlock = "<<fCudaDeviceProp.regsPerBlock<<std::endl;
-         std::cout<<"warpSize = "<<fCudaDeviceProp.warpSize<<std::endl;
-         std::cout<<"memPitch = "<<fCudaDeviceProp.memPitch<<std::endl;
-         std::cout<<"maxThreadsPerBlock = "<<fCudaDeviceProp.maxThreadsPerBlock<<std::endl;
-         std::cout<<"maxThreadsDim = "<<fCudaDeviceProp.maxThreadsDim[0]<<" "<<fCudaDeviceProp.maxThreadsDim[1]<<" "<<fCudaDeviceProp.maxThreadsDim[2]<<std::endl;
-         std::cout<<"maxGridSize = "  <<fCudaDeviceProp.maxGridSize[0]<<" "<<fCudaDeviceProp.maxGridSize[1]<<" "<<fCudaDeviceProp.maxGridSize[2]<<std::endl;
-         std::cout<<"totalConstMem = "<<fCudaDeviceProp.totalConstMem<<std::endl;
-         std::cout<<"major = "<<fCudaDeviceProp.major<<std::endl;
-         std::cout<<"minor = "<<fCudaDeviceProp.minor<<std::endl;
-         std::cout<<"clockRate = "<<fCudaDeviceProp.clockRate<<std::endl;
-         std::cout<<"textureAlignment = "<<fCudaDeviceProp.textureAlignment<<std::endl;
+         HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);
+         HLTInfo("totalGlobalMem = %lld", (unsigned long long int) fCudaDeviceProp.totalGlobalMem);
+         HLTInfo("sharedMemPerBlock = %lld", (unsigned long long int) fCudaDeviceProp.sharedMemPerBlock);
+         HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);
+         HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);
+         HLTInfo("memPitch = %lld", (unsigned long long int) fCudaDeviceProp.memPitch);
+         HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);
+         HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);
+         HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);
+         HLTInfo("totalConstMem = %lld", (unsigned long long int) fCudaDeviceProp.totalConstMem);
+         HLTInfo("major = %d", fCudaDeviceProp.major);
+         HLTInfo("minor = %d", fCudaDeviceProp.minor);
+         HLTInfo("clockRate %d= ", fCudaDeviceProp.clockRate);
+         HLTInfo("textureAlignment %lld= ", (unsigned long long int) fCudaDeviceProp.textureAlignment);
    }
  
    if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
    {
         HLTError( "Unsupported CUDA Device" );
-         return(1);
+       fgGPUUsed = 0;
+       ReleaseGlobalLock(semLock);
+       return(1);
    }
  
    if (CudaFailedMsg(cudaSetDevice(cudaDevice)))
    {
           HLTError("Could not set CUDA Device!");
+         fgGPUUsed = 0;
+         ReleaseGlobalLock(semLock);
           return(1);
    }
  
-  if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
-  {
-         HLTError("Insufficiant Common Memory");
-         cudaThreadExit();
-         return(1);
-  }
-
-  if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
-  {
-         HLTError("Insufficiant Row Memory");
-         cudaThreadExit();
-         return(1);
-  }
-
-  fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
    if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
    {
           HLTError("CUDA Memory Allocation Error");
           cudaThreadExit();
+         fgGPUUsed = 0;
+         ReleaseGlobalLock(semLock);
           return(1);
    }
+  ReleaseGlobalLock(semLock);
    if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
    int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
    if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
@@ -187,6 +278,7 @@ int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
           cudaFree(fGPUMemory);
           cudaThreadExit();
           HLTError("Error allocating Page Locked Host Memory");
+         fgGPUUsed = 0;
           return(1);
    }
    if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);
@@ -195,22 +287,16 @@ int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
    {
           CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
    }
-  HLTInfo("CUDA Initialisation successfull");
  
-  //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
-  if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
-  {
-         HLTError("Insufficiant Tracker Object Memory");
-         return(1);
-  }
    fSliceCount = sliceCount;
+  //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
    fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);
  
    for (int i = 0;i < fgkNSlices;i++)
    {
      fSlaveTrackers[i].SetGPUTracker();
         fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
-       fSlaveTrackers[i].pData()->SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
+       fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
    }
  
    fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
@@ -219,42 +305,70 @@ int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
    {
         if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
         {
-               HLTError("Error creating CUDA Stream");
-               return(1);
+           cudaFree(fGPUMemory);
+           cudaFreeHost(fHostLockedMemory);
+           cudaThreadExit();
+           HLTError("Error creating CUDA Stream");
+           fgGPUUsed = 0;
+           return(1);
         }
    }
  
+  fCudaInitialized = 1;
+  HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %d)", cudaDevice, fCudaDeviceProp.name, fThreadId);
+
  #if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)
    if (fDebugLevel < 2)
    {
           //Do one initial run for Benchmark reasons
           const int useDebugLevel = fDebugLevel;
           fDebugLevel = 0;
-         AliHLTTPCCAClusterData tmpCluster;
+         AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];
  
           std::ifstream fin;
-         fin.open("events/event.0.dump");
-         tmpCluster.ReadEvent(fin);
-         fin.close();
  
-         AliHLTTPCCASliceOutput *tmpOutput = NULL;
           AliHLTTPCCAParam tmpParam;
           AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;
-         fSlaveTrackers[0].SetOutputControl(&tmpOutputControl);
-         tmpParam.SetNRows(HLTCA_ROW_COUNT);
-         fSlaveTrackers[0].SetParam(tmpParam);
-         Reconstruct(&tmpOutput, &tmpCluster, 0, 1);
-         free(tmpOutput);
-         tmpOutput = NULL;
-         fSlaveTrackers[0].SetOutputControl(NULL);
+
+         fin.open("events/settings.dump");
+         int tmpCount;
+         fin >> tmpCount;
+         for (int i = 0;i < sliceCount;i++)
+         {
+               fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);
+               tmpParam.ReadSettings(fin);
+               InitializeSliceParam(i, tmpParam);
+         }
+         fin.close();
+
+         fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);
+         for (int i = 0;i < sliceCount;i++)
+         {
+               tmpCluster[i].StartReading(i, 0);
+               tmpCluster[i].ReadEvent(fin);
+               tmpCluster[i].FinishReading();
+         }
+         fin.close();
+
+         AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];
+         memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));
+
+         Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);
+         for (int i = 0;i < sliceCount;i++)
+         {
+                 free(tmpOutput[i]);
+                 tmpOutput[i] = NULL;
+                 fSlaveTrackers[i].SetOutputControl(NULL);
+         }
+         delete[] tmpOutput;
+         delete[] tmpCluster;
           fDebugLevel = useDebugLevel;
    }
  #endif
-    fgGPUUsed = true;
    return(0);
  }
  
-template <class T> inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int alignment)
+template <class T> inline T* AliHLTTPCCAGPUTrackerNVCC::alignPointer(T* ptr, int alignment)
  {
         //Macro to align Pointers.
         //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker
@@ -268,7 +382,7 @@ template <class T> inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int ali
         return((T*) adr);
  }
  
-bool AliHLTTPCCAGPUTracker::CudaFailedMsg(cudaError_t error)
+bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsg(cudaError_t error)
  {
         //Check for CUDA Error and in the case of an error display the corresponding error string
         if (error == cudaSuccess) return(false);
@@ -276,7 +390,7 @@ bool AliHLTTPCCAGPUTracker::CudaFailedMsg(cudaError_t error)
         return(true);
  }
  
-int AliHLTTPCCAGPUTracker::CUDASync(char* state)
+int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state)
  {
         //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards
  
@@ -297,14 +411,14 @@ int AliHLTTPCCAGPUTracker::CUDASync(char* state)
         return(0);
  }
  
-void AliHLTTPCCAGPUTracker::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
+void AliHLTTPCCAGPUTrackerNVCC::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
  {
         //Set Debug Level and Debug output File if applicable
         fDebugLevel = dwLevel;
         if (NewOutFile) fOutFile = NewOutFile;
  }
  
-int AliHLTTPCCAGPUTracker::SetGPUTrackerOption(char* OptionName, int /*OptionValue*/)
+int AliHLTTPCCAGPUTrackerNVCC::SetGPUTrackerOption(char* OptionName, int /*OptionValue*/)
  {
         //Set a specific GPU Tracker Option
         {
@@ -316,7 +430,7 @@ int AliHLTTPCCAGPUTracker::SetGPUTrackerOption(char* OptionName, int /*OptionVal
  }
  
  #ifdef HLTCA_STANDALONE
-void AliHLTTPCCAGPUTracker::StandalonePerfTime(int iSlice, int i)
+void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int iSlice, int i)
  {
    //Run Performance Query for timer i of slice iSlice
    if (fDebugLevel >= 1)
@@ -325,10 +439,10 @@ void AliHLTTPCCAGPUTracker::StandalonePerfTime(int iSlice, int i)
    }
  }
  #else
-void AliHLTTPCCAGPUTracker::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
+void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
  #endif
  
-void AliHLTTPCCAGPUTracker::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
+void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
  {
         //Dump Rowblocks to File
         if (fDebugLevel >= 4)
@@ -347,7 +461,7 @@ void AliHLTTPCCAGPUTracker::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlic
                 for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
                 {
                         *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " << 
-                               rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << endl << "Phase 1: ";
+                               rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";
                         for (int j = 0;j < rowBlockPos[i].x;j++)
                         {
                                 //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
@@ -362,12 +476,12 @@ void AliHLTTPCCAGPUTracker::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlic
                                         HLTError("Error, -1 Tracklet found");
                                 }
                         }
-                       *fOutFile << endl << "Phase 2: ";
+                       *fOutFile << std::endl << "Phase 2: ";
                         for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
                         {
                                 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
                         }
-                       *fOutFile << endl;
+                       *fOutFile << std::endl;
                 }
  
                 if (check)
@@ -402,12 +516,30 @@ __global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTra
                 sliceDataHitWeights4[i] = i0;
  }
  
-int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
+int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
  {
         //Primary reconstruction function
         cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
  
-       if (sliceCountLocal == -1) sliceCountLocal = this->fSliceCount;
+       if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;
+       
+       if (!fCudaInitialized)
+       {
+           HLTError("GPUTracker not initialized");
+           return(1);
+       }
+       if (sliceCountLocal > fSliceCount)
+       {
+           HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);
+           return(1);
+       }
+       if (fThreadId != GetThread())
+       {
+           HLTError("GPUTracker context was initialized by different thread, Initializing Thread: %d, Processing Thread: %d", fThreadId, GetThread());
+           return(1);
+       }
+
+       if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);
  
         if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
         {
@@ -419,19 +551,18 @@ int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTT
         {
                 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
                 {
-                       *fOutFile << endl << endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << endl;
+                       *fOutFile << std::endl << std::endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << std::endl;
                 }
         }
  
         memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
  
-       if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice + sliceCountLocal].Param().ISlice());
         if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
  
  #ifdef HLTCA_GPU_TIME_PROFILE
-       __int64 a, b, c, d;
-       QueryPerformanceFrequency((LARGE_INTEGER*) &c);
-       QueryPerformanceCounter((LARGE_INTEGER*) &d);
+       unsigned __int64 a, b, c, d;
+       AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);
+       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);
  #endif
         
         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
@@ -439,8 +570,8 @@ int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTT
                 //Make this a GPU Tracker
                 fGpuTracker[iSlice].SetGPUTracker();
                 fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
-               fGpuTracker[iSlice].pData()->SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
-               fGpuTracker[iSlice].pData()->SetPointers(&pClusterData[iSlice], false);
+               fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
+               fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
  
                 //Set Pointers to GPU Memory
                 char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
@@ -478,7 +609,7 @@ int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTT
                 fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
                 fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
                 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
-               fGpuTracker[iSlice].pData()->SetGPUTextureBase(fGpuTracker[0].Data().Memory());
+               fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
         }
  
  #ifdef HLTCA_GPU_TEXTURE_FETCH
@@ -486,19 +617,19 @@ int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTT
                 size_t offset;
                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
                 {
-                       HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
+                       HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
                         return(1);
                 }
                 cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
                 {
-                       HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
+                       HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
                         return(1);
                 }
                 cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
                 {
-                       HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
+                       HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
                         return(1);
                 }
  #endif
@@ -517,13 +648,21 @@ int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTT
  
                 //Initialize GPU Slave Tracker
                 if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
-               fSlaveTrackers[firstSlice + iSlice].pData()->SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
+               fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
  #ifdef HLTCA_GPU_TIME_PROFILE
-               QueryPerformanceCounter((LARGE_INTEGER*) &a);
+                       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
  #endif
                 fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
+
+                 if (fDebugLevel >= 4)
+                 {
+                         *fOutFile << std::endl << std::endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << std::endl;
+                         *fOutFile << "Slice Data:" << std::endl;
+                         fSlaveTrackers[firstSlice + iSlice].DumpSliceData(*fOutFile);
+                 }
+
  #ifdef HLTCA_GPU_TIME_PROFILE
-               QueryPerformanceCounter((LARGE_INTEGER*) &b);
+                       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
                 printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
  #endif
                 if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
@@ -551,10 +690,8 @@ int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTT
                 if (fDebugLevel >= 4)
                 {
                         if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
-                       fSlaveTrackers[firstSlice + iSlice].TrackletMemory() = reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] );
-                       fSlaveTrackers[firstSlice + iSlice].SetPointersTracklets( HLTCA_GPU_MAX_TRACKLETS );
-                       fSlaveTrackers[firstSlice + iSlice].HitMemory() = reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100] );
-                       fSlaveTrackers[firstSlice + iSlice].SetPointersHits( pClusterData[iSlice].NumberOfClusters() );
+                       fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS);
+                       fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() );
                 }
                 
                 if (CUDASync("Initialization")) return(1);
@@ -652,6 +789,9 @@ int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTT
         }
  #endif
  
+       int nHardCollisions = 0;
+
+RestartTrackletConstructor:
         if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
         {
@@ -682,10 +822,12 @@ int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTT
                 }
         }
  
-       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT)
+       int runSlices = 0;
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)
         {
-               if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT);
-               AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT, sliceCountLocal - iSlice));
+               if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;
+               if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);
+               AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));
         }
         if (CUDASync("Tracklet Selector")) return 1;
         StandalonePerfTime(firstSlice, 9);
@@ -718,6 +860,21 @@ int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTT
  
                         if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
                         {
+                               if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION && nHardCollisions++ < 10)
+                               {
+                                       HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor");
+                                       for (int i = 0;i < sliceCountLocal;i++)
+                                       {
+                                               cudaThreadSynchronize();
+                                               CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));
+                                               *fSlaveTrackers[firstSlice + i].NTracks() = 0;
+                                               *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;
+                                               fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;
+                                               CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));
+                                               PreInitRowBlocks<<<30, 256>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());
+                                       }
+                                       goto RestartTrackletConstructor;
+                               }
                                 HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
                                 return(1);
                         }
@@ -725,11 +882,11 @@ int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTT
  
                         fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
  #ifdef HLTCA_GPU_TIME_PROFILE
-                       QueryPerformanceCounter((LARGE_INTEGER*) &a);
+                       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
  #endif
                         fSlaveTrackers[firstSlice + iSlice].WriteOutput();
  #ifdef HLTCA_GPU_TIME_PROFILE
-                       QueryPerformanceCounter((LARGE_INTEGER*) &b);
+                       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
                         printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
  #endif
  
@@ -806,7 +963,7 @@ int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTT
         return(0);
  }
  
-int AliHLTTPCCAGPUTracker::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
+int AliHLTTPCCAGPUTrackerNVCC::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
  {
         //Initialize Slice Tracker Parameter for a slave tracker
         fSlaveTrackers[iSlice].Initialize(param);
@@ -818,7 +975,7 @@ int AliHLTTPCCAGPUTracker::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &pa
         return(0);
  }
  
-int AliHLTTPCCAGPUTracker::ExitGPU()
+int AliHLTTPCCAGPUTrackerNVCC::ExitGPU()
  {
         //Uninitialize CUDA
         cudaThreadSynchronize();
@@ -845,11 +1002,13 @@ int AliHLTTPCCAGPUTracker::ExitGPU()
         }
         HLTInfo("CUDA Uninitialized");
         fgGPUUsed = false;
+       fCudaInitialized = 0;
         return(0);
  }
  
-void AliHLTTPCCAGPUTracker::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
+void AliHLTTPCCAGPUTrackerNVCC::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
  {
+       //Set Output Control Pointers
         fOutputControl = val;
         for (int i = 0;i < fgkNSlices;i++)
         {
@@ -857,4 +1016,40 @@ void AliHLTTPCCAGPUTracker::SetOutputControl( AliHLTTPCCASliceOutput::outputCont
         }
  }
  
-#endif
-\ No newline at end of file
+int AliHLTTPCCAGPUTrackerNVCC::GetThread()
+{
+       //Get Thread ID
+#ifdef R__WIN32
+       return((int) (size_t) GetCurrentThread());
+#else
+       return((int) syscall (SYS_gettid));
+#endif
+}
+
+unsigned long long int* AliHLTTPCCAGPUTrackerNVCC::PerfTimer(int iSlice, unsigned int i)
+{
+       //Returns pointer to PerfTimer i of slice iSlice
+       return(fSlaveTrackers ? fSlaveTrackers[iSlice].PerfTimer(i) : NULL);
+}
+
+const AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerNVCC::OutputControl() const
+{
+       //Return Pointer to Output Control Structure
+       return fOutputControl;
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::GetSliceCount() const
+{
+       //Return max slice count processable
+       return(fSliceCount);
+}
+
+AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()
+{
+       return new AliHLTTPCCAGPUTrackerNVCC;
+}
+void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr)
+{
+       delete ptr;
+}
+