]> git.uio.no Git - u/mrichter/AliRoot.git/blobdiff - HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cu
GPU tracker update
[u/mrichter/AliRoot.git] / HLT / TPCLib / tracking-ca / AliHLTTPCCAGPUTrackerNVCC.cu
index 9faf18c74008b85876cc8a39ec35be4766ac1435..b8327b5bae7f67868db1f7115514bc2c66fdda12 100644 (file)
-// **************************************************************************
-// This file is property of and copyright by the ALICE HLT Project          *
-// ALICE Experiment at CERN, All rights reserved.                           *
-//                                                                          *
-// Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
-//                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *
-//                                     David Rohr <drohr@kip.uni-heidelberg.de>                                *
-//                  for The ALICE HLT Project.                              *
-//                                                                          *
-// Permission to use, copy, modify and distribute this software and its     *
-// documentation strictly for non-commercial purposes is hereby granted     *
-// without fee, provided that the above copyright notice appears in all     *
-// copies and that both the copyright notice and this permission notice     *
-// appear in the supporting documentation. The authors make no claims       *
-// about the suitability of this software for any purpose. It is            *
-// provided "as is" without express or implied warranty.                    *
-//                                                                          *
-//***************************************************************************
-
-#include "AliHLTTPCCAGPUTracker.h"
-
-#ifdef BUILD_GPU
-
-#include <cuda.h>
-#ifdef R__WIN32
-
-#else
-#include <sys/syscall.h>
-#endif
-
-#include "AliHLTTPCCADef.h"
-#include "AliHLTTPCCAGPUConfig.h"
-
-#include <sm_11_atomic_functions.h>
-#include <sm_12_atomic_functions.h>
-
-#include <iostream>
-
-//Disable assertions since they produce errors in GPU Code
-#ifdef assert
-#undef assert
-#endif
-#define assert(param)
-
-__constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];
-#ifdef HLTCA_GPU_TEXTURE_FETCH
-texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;
-texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;
-texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
-#endif
-
-//Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code
-#include "AliHLTTPCCATrackParam.cxx"
-#include "AliHLTTPCCATrack.cxx" 
-
-#include "AliHLTTPCCAHitArea.cxx"
-#include "AliHLTTPCCAGrid.cxx"
-#include "AliHLTTPCCARow.cxx"
-#include "AliHLTTPCCAParam.cxx"
-#include "AliHLTTPCCATracker.cxx"
-
-#include "AliHLTTPCCAProcess.h"
-
-#include "AliHLTTPCCATrackletSelector.cxx"
-#include "AliHLTTPCCANeighboursFinder.cxx"
-#include "AliHLTTPCCANeighboursCleaner.cxx"
-#include "AliHLTTPCCAStartHitsFinder.cxx"
-#include "AliHLTTPCCAStartHitsSorter.cxx"
-#include "AliHLTTPCCATrackletConstructor.cxx"
-#include "AliHLTTPCCASliceOutput.cxx"
-
-#include "MemoryAssignmentHelpers.h"
-
-#ifndef HLTCA_STANDALONE
-#include "AliHLTDefinitions.h"
-#include "AliHLTSystem.h"
-#endif
-
-ClassImp( AliHLTTPCCAGPUTracker )
-
-bool AliHLTTPCCAGPUTracker::fgGPUUsed = false;
-
-int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
-{
-       //Find best CUDA device, initialize and allocate memory
-       
-       if (fgGPUUsed)
-       {
-           HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in same process");
-           return(1);
-       }
-       fgGPUUsed = 1;
-       fThreadId = GetThread();
-
-       cudaDeviceProp fCudaDeviceProp;
-
-       fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
-
-#ifndef CUDA_DEVICE_EMULATION
-       int count, bestDevice = -1;
-       long long int bestDeviceSpeed = 0, deviceSpeed;
-       if (CudaFailedMsg(cudaGetDeviceCount(&count)))
-       {
-               HLTError("Error getting CUDA Device Count");
-               fgGPUUsed = 0;
-               return(1);
-       }
-       if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");
-       for (int i = 0;i < count;i++)
-       {
-               unsigned int free, total;
-               cuInit(0);
-               CUdevice tmpDevice;
-               cuDeviceGet(&tmpDevice, i);
-               CUcontext tmpContext;
-               cuCtxCreate(&tmpContext, 0, tmpDevice);
-               if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";
-               cuCtxDestroy(tmpContext);
-               CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i));
-
-               int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && free >= fGPUMemSize;
-
-               if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %d / %d)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, free, fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");
-               deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free;
-               if (deviceOK && deviceSpeed > bestDeviceSpeed)
-               {
-                       bestDevice = i;
-                       bestDeviceSpeed = deviceSpeed;
-               }
-       }
-       if (bestDevice == -1)
-       {
-               HLTWarning("No CUDA Device available, aborting CUDA Initialisation");
-               fgGPUUsed = 0;
-               return(1);
-       }
-
-  int cudaDevice;
-  if (forceDeviceID == -1)
-         cudaDevice = bestDevice;
-  else
-         cudaDevice = forceDeviceID;
-#else
-       int cudaDevice = 0;
-#endif
-
-  cudaGetDeviceProperties(&fCudaDeviceProp ,cudaDevice ); 
-
-  if (fDebugLevel >= 1)
-  {
-         HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);
-         HLTInfo("totalGlobalMem = %d", fCudaDeviceProp.totalGlobalMem);
-         HLTInfo("sharedMemPerBlock = %d", fCudaDeviceProp.sharedMemPerBlock);
-         HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);
-         HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);
-         HLTInfo("memPitch = %d", fCudaDeviceProp.memPitch);
-         HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);
-         HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);
-         HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);
-         HLTInfo("totalConstMem = %d", fCudaDeviceProp.totalConstMem);
-         HLTInfo("major = %d", fCudaDeviceProp.major);
-         HLTInfo("minor = %d", fCudaDeviceProp.minor);
-         HLTInfo("clockRate %d= ", fCudaDeviceProp.clockRate);
-         HLTInfo("textureAlignment %d= ", fCudaDeviceProp.textureAlignment);
-  }
-
-  if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
-  {
-       HLTError( "Unsupported CUDA Device" );
-       fgGPUUsed = 0;
-       return(1);
-  }
-
-  if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
-  {
-         HLTError("Insufficiant Tracker Object Memory");
-         fgGPUUsed = 0;
-         return(1);
-  }
-  
-  if (CudaFailedMsg(cudaSetDevice(cudaDevice)))
-  {
-         HLTError("Could not set CUDA Device!");
-         fgGPUUsed = 0;
-         return(1);
-  }
-
-  if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
-  {
-         HLTError("Insufficiant Common Memory");
-         cudaThreadExit();
-         fgGPUUsed = 0;
-         return(1);
-  }
-
-  if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
-  {
-         HLTError("Insufficiant Row Memory");
-         cudaThreadExit();
-         fgGPUUsed = 0;
-         return(1);
-  }
-
-  if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
-  {
-         HLTError("CUDA Memory Allocation Error");
-         cudaThreadExit();
-         fgGPUUsed = 0;
-         return(1);
-  }
-  if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
-  int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
-  if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
-  {
-         cudaFree(fGPUMemory);
-         cudaThreadExit();
-         HLTError("Error allocating Page Locked Host Memory");
-         fgGPUUsed = 0;
-         return(1);
-  }
-  if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);
-
-  if (fDebugLevel >= 1)
-  {
-         CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
-  }
-
-  fSliceCount = sliceCount;
-  //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
-  fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);
-
-  for (int i = 0;i < fgkNSlices;i++)
-  {
-    fSlaveTrackers[i].SetGPUTracker();
-       fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
-       fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
-  }
-
-  fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
-  cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
-  for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
-  {
-       if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
-       {
-           cudaFree(fGPUMemory);
-           cudaFreeHost(fHostLockedMemory);
-           cudaThreadExit();
-           HLTError("Error creating CUDA Stream");
-           fgGPUUsed = 0;
-           return(1);
-       }
-  }
-
-  fCudaInitialized = 1;
-  HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %dd)", cudaDevice, fCudaDeviceProp.name, fThreadId);
-
-#if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)
-  if (fDebugLevel < 2)
-  {
-         //Do one initial run for Benchmark reasons
-         const int useDebugLevel = fDebugLevel;
-         fDebugLevel = 0;
-         AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];
-
-         std::ifstream fin;
-
-         AliHLTTPCCAParam tmpParam;
-         AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;
-
-         fin.open("events/settings.dump");
-         int tmpCount;
-         fin >> tmpCount;
-         for (int i = 0;i < sliceCount;i++)
-         {
-               fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);
-               tmpParam.ReadSettings(fin);
-               InitializeSliceParam(i, tmpParam);
-         }
-         fin.close();
-
-         fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);
-         for (int i = 0;i < sliceCount;i++)
-         {
-               tmpCluster[i].StartReading(i, 0);
-               tmpCluster[i].ReadEvent(fin);
-               tmpCluster[i].FinishReading();
-         }
-         fin.close();
-
-         AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];
-         memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));
-
-         Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);
-         for (int i = 0;i < sliceCount;i++)
-         {
-                 free(tmpOutput[i]);
-                 tmpOutput[i] = NULL;
-                 fSlaveTrackers[i].SetOutputControl(NULL);
-         }
-         delete[] tmpOutput;
-         delete[] tmpCluster;
-         fDebugLevel = useDebugLevel;
-  }
-#endif
-  return(0);
-}
-
-template <class T> inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int alignment)
-{
-       //Macro to align Pointers.
-       //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker
-       //(As long as every single data structure is <= 1 MB)
-
-       size_t adr = (size_t) ptr;
-       if (adr % alignment)
-       {
-               adr += alignment - (adr % alignment);
-       }
-       return((T*) adr);
-}
-
-bool AliHLTTPCCAGPUTracker::CudaFailedMsg(cudaError_t error)
-{
-       //Check for CUDA Error and in the case of an error display the corresponding error string
-       if (error == cudaSuccess) return(false);
-       HLTWarning("CUDA Error: %d / %s", error, cudaGetErrorString(error));
-       return(true);
-}
-
-int AliHLTTPCCAGPUTracker::CUDASync(char* state)
-{
-       //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards
-
-       if (fDebugLevel == 0) return(0);
-       cudaError cuErr;
-       cuErr = cudaGetLastError();
-       if (cuErr != cudaSuccess)
-       {
-               HLTError("Cuda Error %s while invoking kernel (%s)", cudaGetErrorString(cuErr), state);
-               return(1);
-       }
-       if (CudaFailedMsg(cudaThreadSynchronize()))
-       {
-               HLTError("CUDA Error while synchronizing (%s)", state);
-               return(1);
-       }
-       if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
-       return(0);
-}
-
-void AliHLTTPCCAGPUTracker::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
-{
-       //Set Debug Level and Debug output File if applicable
-       fDebugLevel = dwLevel;
-       if (NewOutFile) fOutFile = NewOutFile;
-}
-
-int AliHLTTPCCAGPUTracker::SetGPUTrackerOption(char* OptionName, int /*OptionValue*/)
-{
-       //Set a specific GPU Tracker Option
-       {
-               HLTError("Unknown Option: %s", OptionName);
-               return(1);
-       }
-       //No Options used at the moment
-       //return(0);
-}
-
-#ifdef HLTCA_STANDALONE
-void AliHLTTPCCAGPUTracker::StandalonePerfTime(int iSlice, int i)
-{
-  //Run Performance Query for timer i of slice iSlice
-  if (fDebugLevel >= 1)
-  {
-         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));
-  }
-}
-#else
-void AliHLTTPCCAGPUTracker::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
-#endif
-
-void AliHLTTPCCAGPUTracker::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
-{
-       //Dump Rowblocks to File
-       if (fDebugLevel >= 4)
-       {
-               *fOutFile << "RowBlock Tracklets" << std::endl;
-       
-               int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);
-               int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);
-               uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT);
-               CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
-               CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
-               CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT, cudaMemcpyDeviceToHost));
-               CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
-
-               int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
-               for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
-               {
-                       *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " << 
-                               rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";
-                       for (int j = 0;j < rowBlockPos[i].x;j++)
-                       {
-                               //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
-                               *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
-                               if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)
-                               {
-                                       HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);
-                               }
-                               k++;
-                               if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)
-                               {
-                                       HLTError("Error, -1 Tracklet found");
-                               }
-                       }
-                       *fOutFile << std::endl << "Phase 2: ";
-                       for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
-                       {
-                               *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
-                       }
-                       *fOutFile << std::endl;
-               }
-
-               if (check)
-               {
-                       *fOutFile << "Starting Threads: (First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;
-                       for (int i = 0;i < HLTCA_GPU_BLOCK_COUNT;i++)
-                       {
-                               *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;
-                       }
-               }
-
-               free(rowBlockPos);
-               free(rowBlockTracklets);
-               free(blockStartingTracklet);
-       }
-}
-
-__global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)
-{
-       //Initialize GPU RowBlocks and HitWeights
-       int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;
-       int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;
-       const int stride = blockDim.x * gridDim.x;
-       int4 i0, i1;
-       i0.x = i0.y = i0.z = i0.w = 0;
-       i1.x = i1.y = i1.z = i1.w = -1;
-       for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)
-               RowBlockPos[i] = i0;
-       for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)
-               rowBlockTracklets4[i] = i1;
-       for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)
-               sliceDataHitWeights4[i] = i0;
-}
-
-int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
-{
-       //Primary reconstruction function
-       cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
-
-       if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;
-       
-       if (!fCudaInitialized)
-       {
-           HLTError("GPUTracker not initialized");
-           return(1);
-       }
-       if (sliceCountLocal > fSliceCount)
-       {
-           HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);
-           return(1);
-       }
-       if (fThreadId != GetThread())
-       {
-           HLTError("GPUTracker context was initialized by different thread, Initializing Thread: %d, Processing Thread: %d", fThreadId, GetThread());
-           return(1);
-       }
-
-       if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);
-
-       if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
-       {
-               HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));
-               return(1);
-       }
-
-       if (fDebugLevel >= 4)
-       {
-               for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
-               {
-                       *fOutFile << std::endl << std::endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << std::endl;
-               }
-       }
-
-       memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
-
-       if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
-
-#ifdef HLTCA_GPU_TIME_PROFILE
-       unsigned __int64 a, b, c, d;
-       AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);
-       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);
-#endif
-       
-       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
-       {
-               //Make this a GPU Tracker
-               fGpuTracker[iSlice].SetGPUTracker();
-               fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
-               fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
-               fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
-
-               //Set Pointers to GPU Memory
-               char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
-
-               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
-               tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
-               tmpMem = alignPointer(tmpMem, 1024 * 1024);
-
-               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
-               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);
-               tmpMem = alignPointer(tmpMem, 1024 * 1024);
-
-               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
-               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());
-               tmpMem = alignPointer(tmpMem, 1024 * 1024);
-
-               if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
-               {
-                       HLTError("Insufficiant Track Memory");
-                       return(1);
-               }
-
-               if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
-               {
-                       HLTError("Insufficiant Global Memory");
-                       return(1);
-               }
-
-               //Initialize Startup Constants
-               *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
-               *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
-               *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
-               fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = HLTCA_GPU_BLOCK_COUNT * (iSlice + 1) / sliceCountLocal - HLTCA_GPU_BLOCK_COUNT * (iSlice) / sliceCountLocal;
-               if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);
-               fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
-               fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
-               fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
-               fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
-       }
-
-#ifdef HLTCA_GPU_TEXTURE_FETCH
-               cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
-               size_t offset;
-               if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
-               {
-                       HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
-                       return(1);
-               }
-               cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
-               if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
-               {
-                       HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
-                       return(1);
-               }
-               cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
-               if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
-               {
-                       HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
-                       return(1);
-               }
-#endif
-
-       //Copy Tracker Object to GPU Memory
-       if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
-#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
-       if (CudaFailedMsg(cudaMalloc(&fGpuTracker[0].fStageAtSync, 100000000))) return(1);
-       CudaFailedMsg(cudaMemset(fGpuTracker[0].fStageAtSync, 0, 100000000));
-#endif
-       CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));
-
-       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
-       {
-               StandalonePerfTime(firstSlice + iSlice, 0);
-
-               //Initialize GPU Slave Tracker
-               if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
-               fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
-#ifdef HLTCA_GPU_TIME_PROFILE
-                       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
-#endif
-               fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
-#ifdef HLTCA_GPU_TIME_PROFILE
-                       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
-               printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
-#endif
-               if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
-               {
-                       HLTError("Insufficiant Slice Data Memory");
-                       return(1);
-               }
-
-               /*if (fSlaveTrackers[firstSlice + iSlice].CheckEmptySlice())
-               {
-                       if (fDebugLevel >= 3) HLTInfo("Slice Empty, not running GPU Tracker");
-                       if (sliceCountLocal == 1)
-                               return(0);
-               }*/
-
-               //Initialize temporary memory where needed
-               if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");           
-               PreInitRowBlocks<<<30, 256, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());
-
-               //Copy Data to GPU Global Memory
-               CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
-               CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
-               CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
-
-               if (fDebugLevel >= 4)
-               {
-                       if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
-                       fSlaveTrackers[firstSlice + iSlice].TrackletMemory() = reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] );
-                       fSlaveTrackers[firstSlice + iSlice].SetPointersTracklets( HLTCA_GPU_MAX_TRACKLETS );
-                       fSlaveTrackers[firstSlice + iSlice].HitMemory() = reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100] );
-                       fSlaveTrackers[firstSlice + iSlice].SetPointersHits( pClusterData[iSlice].NumberOfClusters() );
-               }
-               
-               if (CUDASync("Initialization")) return(1);
-               StandalonePerfTime(firstSlice + iSlice, 1);
-
-               if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");
-               AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
-
-               if (CUDASync("Neighbours finder")) return 1;
-
-               StandalonePerfTime(firstSlice + iSlice, 2);
-
-               if (fDebugLevel >= 4)
-               {
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
-                       fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
-               }
-
-               if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");
-               AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
-               if (CUDASync("Neighbours Cleaner")) return 1;
-
-               StandalonePerfTime(firstSlice + iSlice, 3);
-
-               if (fDebugLevel >= 4)
-               {
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
-                       fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
-               }
-
-               if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");
-               AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
-               if (CUDASync("Start Hits Finder")) return 1;
-
-               StandalonePerfTime(firstSlice + iSlice, 4);
-
-               if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter");
-               AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<30, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
-               if (CUDASync("Start Hits Sorter")) return 1;
-
-               StandalonePerfTime(firstSlice + iSlice, 5);
-
-               if (fDebugLevel >= 2)
-               {
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
-                       if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
-                       if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS)
-                       {
-                               HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant");
-                               return(1);
-                       }
-               }
-
-               if (fDebugLevel >= 4)
-               {
-                       *fOutFile << "Temporary ";
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
-                       fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
-                       uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
-                       CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
-                       *fOutFile << "Start Hits Sort Vector:" << std::endl;
-                       for (int i = 0;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows();i++)
-                       {
-                               *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;
-                       }
-                       free(tmpMemory);
-
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
-                       fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
-               }
-
-               StandalonePerfTime(firstSlice + iSlice, 6);
-               
-               fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
-       }
-
-       StandalonePerfTime(firstSlice, 7);
-#ifdef HLTCA_GPU_PREFETCHDATA
-       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
-       {
-               if (fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v) > ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4))
-               {
-                       HLTError("Insufficiant GPU shared Memory, required: %d, available %d", fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v), ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4));
-                       return(1);
-               }
-               if (fDebugLevel >= 1)
-               {
-                       static int infoShown = 0;
-                       if (!infoShown)
-                       {
-                               HLTInfo("GPU Shared Memory Cache Size: %d", 2 * fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v));
-                               infoShown = 1;
-                       }
-               }
-       }
-#endif
-
-       int nHardCollisions = 0;
-
-RestartTrackletConstructor:
-       if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
-       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
-       {
-               AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);
-               if (CUDASync("Tracklet Initializer")) return 1;
-               DumpRowBlocks(fSlaveTrackers, iSlice);
-       }
-
-       if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
-       AliHLTTPCCATrackletConstructorNewGPU<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>();
-       if (CUDASync("Tracklet Constructor (new)")) return 1;
-       
-       StandalonePerfTime(firstSlice, 8);
-
-       if (fDebugLevel >= 4)
-       {
-               for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
-               {
-                       DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
-                       if (fDebugLevel >= 5)
-                       {
-                               HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
-                       }
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
-                       fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);
-               }
-       }
-
-       int runSlices = 0;
-       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)
-       {
-               if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;
-               if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);
-               AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));
-       }
-       if (CUDASync("Tracklet Selector")) return 1;
-       StandalonePerfTime(firstSlice, 9);
-
-       CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + 0].CommonMemory(), fGpuTracker[0].CommonMemory(), fGpuTracker[0].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[0]));
-       for (int iSliceTmp = 0;iSliceTmp <= sliceCountLocal;iSliceTmp++)
-       {
-               if (iSliceTmp < sliceCountLocal)
-               {
-                       int iSlice = iSliceTmp;
-                       if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");
-                       cudaStreamSynchronize(cudaStreams[iSlice]);
-                       CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
-                       CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
-                       if (iSlice + 1 < sliceCountLocal)
-                               CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[iSlice + 1]));
-               }
-
-               if (iSliceTmp)
-               {
-                       int iSlice = iSliceTmp - 1;
-                       cudaStreamSynchronize(cudaStreams[iSlice]);
-
-                       if (fDebugLevel >= 4)
-                       {
-                               CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
-                               fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
-                               fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);
-                       }
-
-                       if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
-                       {
-                               if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION && nHardCollisions++ < 10)
-                               {
-                                       HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor");
-                                       for (int i = 0;i < sliceCountLocal;i++)
-                                       {
-                                               cudaThreadSynchronize();
-                                               CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));
-                                               *fSlaveTrackers[firstSlice + i].NTracks() = 0;
-                                               *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;
-                                               fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;
-                                               CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));
-                                               PreInitRowBlocks<<<30, 256>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());
-                                       }
-                                       goto RestartTrackletConstructor;
-                               }
-                               HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
-                               return(1);
-                       }
-                       if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
-
-                       fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
-#ifdef HLTCA_GPU_TIME_PROFILE
-                       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
-#endif
-                       fSlaveTrackers[firstSlice + iSlice].WriteOutput();
-#ifdef HLTCA_GPU_TIME_PROFILE
-                       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
-                       printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
-#endif
-
-                       if (fDebugLevel >= 4)
-                       {
-                               delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();
-                               delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();
-                       }
-               }
-       }
-
-       StandalonePerfTime(firstSlice, 10);
-
-       if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
-
-#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
-       char* stageAtSync = (char*) malloc(100000000);
-       CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].fStageAtSync, 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
-       cudaFree(fGpuTracker[0].fStageAtSync);
-
-       FILE* fp = fopen("profile.txt", "w+");
-       FILE* fp2 = fopen("profile.bmp", "w+b");
-       int nEmptySync = 0, fEmpty;
-
-       const int bmpheight = 1000;
-       BITMAPFILEHEADER bmpFH;
-       BITMAPINFOHEADER bmpIH;
-       ZeroMemory(&bmpFH, sizeof(bmpFH));
-       ZeroMemory(&bmpIH, sizeof(bmpIH));
-       
-       bmpFH.bfType = 19778; //"BM"
-       bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1) * bmpheight ;
-       bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);
-
-       bmpIH.biSize = sizeof(bmpIH);
-       bmpIH.biWidth = HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1;
-       bmpIH.biHeight = bmpheight;
-       bmpIH.biPlanes = 1;
-       bmpIH.biBitCount = 32;
-
-       fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
-       fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);  
-
-       for (int i = 0;i < bmpheight * HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;i += HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT)
-       {
-               fEmpty = 1;
-               for (int j = 0;j < HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;j++)
-               {
-                       fprintf(fp, "%d\t", stageAtSync[i + j]);
-                       int color = 0;
-                       if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);
-                       if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);
-                       if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);
-                       if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);
-                       fwrite(&color, 1, sizeof(int), fp2);
-                       if (j > 0 && j % 32 == 0)
-                       {
-                               color = RGB(255, 255, 255);
-                               fwrite(&color, 1, 4, fp2);
-                       }
-                       if (stageAtSync[i + j]) fEmpty = 0;
-               }
-               fprintf(fp, "\n");
-               if (fEmpty) nEmptySync++;
-               else nEmptySync = 0;
-               //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;
-       }
-
-       fclose(fp);
-       fclose(fp2);
-       free(stageAtSync);
-#endif 
-
-       return(0);
-}
-
-int AliHLTTPCCAGPUTracker::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
-{
-       //Initialize Slice Tracker Parameter for a slave tracker
-       fSlaveTrackers[iSlice].Initialize(param);
-       if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
-       {
-               HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
-               return(1);
-       }
-       return(0);
-}
-
-int AliHLTTPCCAGPUTracker::ExitGPU()
-{
-       //Uninitialize CUDA
-       cudaThreadSynchronize();
-       if (fGPUMemory)
-       {
-               cudaFree(fGPUMemory);
-               fGPUMemory = NULL;
-       }
-       if (fHostLockedMemory)
-       {
-               for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
-               {
-                       cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
-               }
-               free(fpCudaStreams);
-               fGpuTracker = NULL;
-               cudaFreeHost(fHostLockedMemory);
-       }
-
-       if (CudaFailedMsg(cudaThreadExit()))
-       {
-               HLTError("Could not uninitialize GPU");
-               return(1);
-       }
-       HLTInfo("CUDA Uninitialized");
-       fgGPUUsed = false;
-       fCudaInitialized = 0;
-       return(0);
-}
-
-void AliHLTTPCCAGPUTracker::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
-{
-       fOutputControl = val;
-       for (int i = 0;i < fgkNSlices;i++)
-       {
-               fSlaveTrackers[i].SetOutputControl(val);
-       }
-}
-
-int AliHLTTPCCAGPUTracker::GetThread()
-{
-#ifdef R__WIN32
-       return((int) (size_t) GetCurrentThread());
-#else
-       return((int) syscall (SYS_gettid));
-#endif
-}
-
-#endif
+// **************************************************************************\r
+// This file is property of and copyright by the ALICE HLT Project          *\r
+// ALICE Experiment at CERN, All rights reserved.                           *\r
+//                                                                          *\r
+// Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *\r
+//                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *\r
+//                                     David Rohr <drohr@kip.uni-heidelberg.de>                                *\r
+//                  for The ALICE HLT Project.                              *\r
+//                                                                          *\r
+// Permission to use, copy, modify and distribute this software and its     *\r
+// documentation strictly for non-commercial purposes is hereby granted     *\r
+// without fee, provided that the above copyright notice appears in all     *\r
+// copies and that both the copyright notice and this permission notice     *\r
+// appear in the supporting documentation. The authors make no claims       *\r
+// about the suitability of this software for any purpose. It is            *\r
+// provided "as is" without express or implied warranty.                    *\r
+//                                                                          *\r
+//***************************************************************************\r
+\r
+#include "AliHLTTPCCAGPUTrackerNVCC.h"\r
+\r
+#ifdef HLTCA_GPUCODE\r
+#include <cuda.h>\r
+#include <sm_11_atomic_functions.h>\r
+#include <sm_12_atomic_functions.h>\r
+#endif\r
+\r
+#ifdef R__WIN32\r
+#else\r
+#include <sys/syscall.h>\r
+#include <semaphore.h>\r
+#include <fcntl.h>\r
+#endif\r
+\r
+#include "AliHLTTPCCADef.h"\r
+#include "AliHLTTPCCAGPUConfig.h"\r
+\r
+\r
+#include <iostream>\r
+\r
+//Disable assertions since they produce errors in GPU Code\r
+#ifdef assert\r
+#undef assert\r
+#endif\r
+#define assert(param)\r
+\r
+__constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];\r
+#ifdef HLTCA_GPU_TEXTURE_FETCH\r
+texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;\r
+texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;\r
+texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;\r
+#endif\r
+\r
+//Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code\r
+#include "AliHLTTPCCATrackParam.cxx"\r
+#include "AliHLTTPCCATrack.cxx" \r
+\r
+#include "AliHLTTPCCAHitArea.cxx"\r
+#include "AliHLTTPCCAGrid.cxx"\r
+#include "AliHLTTPCCARow.cxx"\r
+#include "AliHLTTPCCAParam.cxx"\r
+#include "AliHLTTPCCATracker.cxx"\r
+\r
+#include "AliHLTTPCCAProcess.h"\r
+\r
+#include "AliHLTTPCCATrackletSelector.cxx"\r
+#include "AliHLTTPCCANeighboursFinder.cxx"\r
+#include "AliHLTTPCCANeighboursCleaner.cxx"\r
+#include "AliHLTTPCCAStartHitsFinder.cxx"\r
+#include "AliHLTTPCCAStartHitsSorter.cxx"\r
+#include "AliHLTTPCCATrackletConstructor.cxx"\r
+\r
+#include "MemoryAssignmentHelpers.h"\r
+\r
+#ifndef HLTCA_STANDALONE\r
+#include "AliHLTDefinitions.h"\r
+#include "AliHLTSystem.h"\r
+#endif\r
+\r
+ClassImp( AliHLTTPCCAGPUTrackerNVCC )\r
+\r
+bool AliHLTTPCCAGPUTrackerNVCC::fgGPUUsed = false;\r
+\r
+#define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem"\r
+\r
+AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() :\r
+       fGpuTracker(NULL),\r
+       fGPUMemory(NULL),\r
+       fHostLockedMemory(NULL),\r
+       fDebugLevel(0),\r
+       fDebugMask(0xFFFFFFFF),\r
+       fOutFile(NULL),\r
+       fGPUMemSize(0),\r
+       fpCudaStreams(NULL),\r
+       fSliceCount(0),\r
+       fOutputControl(NULL),\r
+       fThreadId(0),\r
+       fCudaInitialized(0),\r
+       fPPMode(0)\r
+       {};\r
+\r
+AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC() {};\r
+\r
+void AliHLTTPCCAGPUTrackerNVCC::ReleaseGlobalLock(void* sem)\r
+{\r
+       //Release the global named semaphore that locks GPU Initialization\r
+#ifdef R__WIN32\r
+       HANDLE* h = (HANDLE*) sem;\r
+       ReleaseSemaphore(*h, 1, NULL);\r
+       CloseHandle(*h);\r
+       delete h;\r
+#else\r
+       sem_t* pSem = (sem_t*) sem;\r
+       sem_post(pSem);\r
+       sem_unlink(SemLockName);\r
+#endif\r
+}\r
+\r
+int AliHLTTPCCAGPUTrackerNVCC::CheckMemorySizes(int sliceCount)\r
+{\r
+       //Check constants for correct memory sizes\r
+  if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)\r
+  {\r
+         HLTError("Insufficiant Tracker Object Memory");\r
+         return(1);\r
+  }\r
+\r
+  if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)\r
+  {\r
+         HLTError("Insufficiant Common Memory");\r
+         return(1);\r
+  }\r
+\r
+  if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)\r
+  {\r
+         HLTError("Insufficiant Row Memory");\r
+         return(1);\r
+  }\r
+\r
+  if (fDebugLevel >= 3)\r
+  {\r
+         HLTInfo("Memory usage: Tracker Object %d / %d, Common Memory %d / %d, Row Memory %d / %d", sizeof(AliHLTTPCCATracker) * sliceCount, HLTCA_GPU_TRACKER_OBJECT_MEMORY, fgkNSlices * AliHLTTPCCATracker::CommonMemorySize(), HLTCA_GPU_COMMON_MEMORY, fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), HLTCA_GPU_ROWS_MEMORY);\r
+  }\r
+  return(0);\r
+}\r
+\r
+int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)\r
+{\r
+       //Find best CUDA device, initialize and allocate memory\r
+\r
+       if (CheckMemorySizes(sliceCount)) return(1);\r
+\r
+#ifdef R__WIN32\r
+       HANDLE* semLock = new HANDLE;\r
+       *semLock = CreateSemaphore(NULL, 1, 1, SemLockName);\r
+       if (*semLock == NULL)\r
+       {\r
+               HLTError("Error creating GPUInit Semaphore");\r
+               return(1);\r
+       }\r
+       WaitForSingleObject(*semLock, INFINITE);\r
+#else\r
+       sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1);\r
+       if (semLock == SEM_FAILED)\r
+       {\r
+               HLTError("Error creating GPUInit Semaphore");\r
+               return(1);\r
+       }\r
+       sem_wait(semLock);\r
+#endif\r
+\r
+       if (fgGPUUsed)\r
+       {\r
+           HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in same process");\r
+               ReleaseGlobalLock(semLock);\r
+           return(1);\r
+       }\r
+       fgGPUUsed = 1;\r
+       fThreadId = GetThread();\r
+\r
+       cudaDeviceProp fCudaDeviceProp;\r
+\r
+       fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);\r
+\r
+#ifndef CUDA_DEVICE_EMULATION\r
+       int count, bestDevice = -1;\r
+       long long int bestDeviceSpeed = 0, deviceSpeed;\r
+       if (CudaFailedMsg(cudaGetDeviceCount(&count)))\r
+       {\r
+               HLTError("Error getting CUDA Device Count");\r
+               fgGPUUsed = 0;\r
+               ReleaseGlobalLock(semLock);\r
+               return(1);\r
+       }\r
+       if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");\r
+       for (int i = 0;i < count;i++)\r
+       {\r
+               unsigned int free, total;\r
+               cuInit(0);\r
+               CUdevice tmpDevice;\r
+               cuDeviceGet(&tmpDevice, i);\r
+               CUcontext tmpContext;\r
+               cuCtxCreate(&tmpContext, 0, tmpDevice);\r
+               if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";\r
+               cuCtxDestroy(tmpContext);\r
+               if (CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i))) continue;\r
+\r
+               int deviceOK = sliceCount <= fCudaDeviceProp.multiProcessorCount && fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && free >= fGPUMemSize;\r
+\r
+               if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %d / %lld)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, free, (long long int) fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");\r
+               deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free;\r
+               if (deviceOK && deviceSpeed > bestDeviceSpeed)\r
+               {\r
+                       bestDevice = i;\r
+                       bestDeviceSpeed = deviceSpeed;\r
+               }\r
+       }\r
+       if (bestDevice == -1)\r
+       {\r
+               HLTWarning("No CUDA Device available, aborting CUDA Initialisation");\r
+               HLTInfo("Requiring Revision 1.3, Mem: %d", fGPUMemSize);\r
+               fgGPUUsed = 0;\r
+               ReleaseGlobalLock(semLock);\r
+               return(1);\r
+       }\r
+\r
+  int cudaDevice;\r
+  if (forceDeviceID == -1)\r
+         cudaDevice = bestDevice;\r
+  else\r
+         cudaDevice = forceDeviceID;\r
+#else\r
+       int cudaDevice = 0;\r
+#endif\r
+\r
+  cudaGetDeviceProperties(&fCudaDeviceProp ,cudaDevice ); \r
+\r
+  if (fDebugLevel >= 1)\r
+  {\r
+         HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);\r
+         HLTInfo("totalGlobalMem = %lld", (unsigned long long int) fCudaDeviceProp.totalGlobalMem);\r
+         HLTInfo("sharedMemPerBlock = %lld", (unsigned long long int) fCudaDeviceProp.sharedMemPerBlock);\r
+         HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);\r
+         HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);\r
+         HLTInfo("memPitch = %lld", (unsigned long long int) fCudaDeviceProp.memPitch);\r
+         HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);\r
+         HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);\r
+         HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);\r
+         HLTInfo("totalConstMem = %lld", (unsigned long long int) fCudaDeviceProp.totalConstMem);\r
+         HLTInfo("major = %d", fCudaDeviceProp.major);\r
+         HLTInfo("minor = %d", fCudaDeviceProp.minor);\r
+         HLTInfo("clockRate %d= ", fCudaDeviceProp.clockRate);\r
+         HLTInfo("textureAlignment %lld= ", (unsigned long long int) fCudaDeviceProp.textureAlignment);\r
+  }\r
+\r
+  if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))\r
+  {\r
+       HLTError( "Unsupported CUDA Device" );\r
+       fgGPUUsed = 0;\r
+       ReleaseGlobalLock(semLock);\r
+       return(1);\r
+  }\r
+\r
+  if (CudaFailedMsg(cudaSetDevice(cudaDevice)))\r
+  {\r
+         HLTError("Could not set CUDA Device!");\r
+         fgGPUUsed = 0;\r
+         ReleaseGlobalLock(semLock);\r
+         return(1);\r
+  }\r
+\r
+  if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))\r
+  {\r
+         HLTError("CUDA Memory Allocation Error");\r
+         cudaThreadExit();\r
+         fgGPUUsed = 0;\r
+         ReleaseGlobalLock(semLock);\r
+         return(1);\r
+  }\r
+  ReleaseGlobalLock(semLock);\r
+  if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);\r
+  int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;\r
+  if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))\r
+  {\r
+         cudaFree(fGPUMemory);\r
+         cudaThreadExit();\r
+         HLTError("Error allocating Page Locked Host Memory");\r
+         fgGPUUsed = 0;\r
+         return(1);\r
+  }\r
+  if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);\r
+\r
+  if (fDebugLevel >= 1)\r
+  {\r
+         CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));\r
+  }\r
+\r
+  fSliceCount = sliceCount;\r
+  //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory\r
+  fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);\r
+\r
+  for (int i = 0;i < fgkNSlices;i++)\r
+  {\r
+    fSlaveTrackers[i].SetGPUTracker();\r
+       fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));\r
+       fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));\r
+  }\r
+\r
+  fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));\r
+  cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;\r
+  for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)\r
+  {\r
+       if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))\r
+       {\r
+           cudaFree(fGPUMemory);\r
+           cudaFreeHost(fHostLockedMemory);\r
+           cudaThreadExit();\r
+           HLTError("Error creating CUDA Stream");\r
+           fgGPUUsed = 0;\r
+           return(1);\r
+       }\r
+  }\r
+\r
+  fCudaInitialized = 1;\r
+  HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %d, Max slices: %d)", cudaDevice, fCudaDeviceProp.name, fThreadId, fSliceCount);\r
+\r
+#if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)\r
+  if (fDebugLevel < 2)\r
+  {\r
+         //Do one initial run for Benchmark reasons\r
+         const int useDebugLevel = fDebugLevel;\r
+         fDebugLevel = 0;\r
+         AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];\r
+\r
+         std::ifstream fin;\r
+\r
+         AliHLTTPCCAParam tmpParam;\r
+         AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;\r
+\r
+         fin.open("events/settings.dump");\r
+         int tmpCount;\r
+         fin >> tmpCount;\r
+         for (int i = 0;i < sliceCount;i++)\r
+         {\r
+               fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);\r
+               tmpParam.ReadSettings(fin);\r
+               InitializeSliceParam(i, tmpParam);\r
+         }\r
+         fin.close();\r
+\r
+         fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);\r
+         for (int i = 0;i < sliceCount;i++)\r
+         {\r
+               tmpCluster[i].StartReading(i, 0);\r
+               tmpCluster[i].ReadEvent(fin);\r
+               tmpCluster[i].FinishReading();\r
+         }\r
+         fin.close();\r
+\r
+         AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];\r
+         memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));\r
+\r
+         Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);\r
+         for (int i = 0;i < sliceCount;i++)\r
+         {\r
+                 free(tmpOutput[i]);\r
+                 tmpOutput[i] = NULL;\r
+                 fSlaveTrackers[i].SetOutputControl(NULL);\r
+         }\r
+         delete[] tmpOutput;\r
+         delete[] tmpCluster;\r
+         fDebugLevel = useDebugLevel;\r
+  }\r
+#endif\r
+  return(0);\r
+}\r
+\r
+template <class T> inline T* AliHLTTPCCAGPUTrackerNVCC::alignPointer(T* ptr, int alignment)\r
+{\r
+       //Macro to align Pointers.\r
+       //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker\r
+       //(As long as every single data structure is <= 1 MB)\r
+\r
+       size_t adr = (size_t) ptr;\r
+       if (adr % alignment)\r
+       {\r
+               adr += alignment - (adr % alignment);\r
+       }\r
+       return((T*) adr);\r
+}\r
+\r
+bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsg(cudaError_t error)\r
+{\r
+       //Check for CUDA Error and in the case of an error display the corresponding error string\r
+       if (error == cudaSuccess) return(false);\r
+       HLTWarning("CUDA Error: %d / %s", error, cudaGetErrorString(error));\r
+       return(true);\r
+}\r
+\r
+int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state, int sliceLocal, int slice)\r
+{\r
+       //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards\r
+\r
+       if (fDebugLevel == 0) return(0);\r
+       cudaError cuErr;\r
+       cuErr = cudaGetLastError();\r
+       if (cuErr != cudaSuccess)\r
+       {\r
+               HLTError("Cuda Error %s while running kernel (%s) (Slice %d; %d/%d)", cudaGetErrorString(cuErr), state, sliceLocal, slice, fgkNSlices);\r
+               return(1);\r
+       }\r
+       if (CudaFailedMsg(cudaThreadSynchronize()))\r
+       {\r
+               HLTError("CUDA Error while synchronizing (%s) (Slice %d; %d/%d)", state, sliceLocal, slice, fgkNSlices);\r
+               return(1);\r
+       }\r
+       if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");\r
+       return(0);\r
+}\r
+\r
+void AliHLTTPCCAGPUTrackerNVCC::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)\r
+{\r
+       //Set Debug Level and Debug output File if applicable\r
+       fDebugLevel = dwLevel;\r
+       if (NewOutFile) fOutFile = NewOutFile;\r
+}\r
+\r
+int AliHLTTPCCAGPUTrackerNVCC::SetGPUTrackerOption(char* OptionName, int OptionValue)\r
+{\r
+       //Set a specific GPU Tracker Option\r
+       if (strcmp(OptionName, "PPMode") == 0)\r
+       {\r
+               fPPMode = OptionValue;\r
+       }\r
+       else if (strcmp(OptionName, "DebugMask") == 0)\r
+       {\r
+               fDebugMask = OptionValue;\r
+       }\r
+       else\r
+       {\r
+               HLTError("Unknown Option: %s", OptionName);\r
+               return(1);\r
+       }\r
+       return(0);\r
+}\r
+\r
+#ifdef HLTCA_STANDALONE\r
+void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int iSlice, int i)\r
+{\r
+  //Run Performance Query for timer i of slice iSlice\r
+  if (fDebugLevel >= 1)\r
+  {\r
+         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));\r
+  }\r
+}\r
+#else\r
+void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}\r
+#endif\r
+\r
+void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)\r
+{\r
+       //Dump Rowblocks to File\r
+       if (fDebugLevel >= 4)\r
+       {\r
+               *fOutFile << "RowBlock Tracklets (Slice " << tracker[iSlice].Param().ISlice() << " (" << iSlice << " of reco))";\r
+               *fOutFile << " after Tracklet Reconstruction";\r
+               *fOutFile << std::endl;\r
+       \r
+               int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);\r
+               int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);\r
+               uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT);\r
+               CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));\r
+               CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));\r
+               CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT, cudaMemcpyDeviceToHost));\r
+               CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));\r
+\r
+               int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;\r
+               for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)\r
+               {\r
+                       *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " << \r
+                               rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";\r
+                       for (int j = 0;j < rowBlockPos[i].x;j++)\r
+                       {\r
+                               //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!\r
+                               *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";\r
+#ifdef HLTCA_GPU_SCHED_FIXED_START\r
+                               if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)\r
+                               {\r
+                                       HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);\r
+                               }\r
+#endif //HLTCA_GPU_SCHED_FIXED_START\r
+                               k++;\r
+                               if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)\r
+                               {\r
+                                       HLTError("Error, -1 Tracklet found");\r
+                               }\r
+                       }\r
+                       *fOutFile << std::endl << "Phase 2: ";\r
+                       for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)\r
+                       {\r
+                               *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";\r
+                       }\r
+                       *fOutFile << std::endl;\r
+               }\r
+\r
+               if (check)\r
+               {\r
+                       *fOutFile << "Starting Threads: (First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;\r
+                       for (int i = 0;i < HLTCA_GPU_BLOCK_COUNT;i++)\r
+                       {\r
+                               *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;\r
+                       }\r
+               }\r
+\r
+               free(rowBlockPos);\r
+               free(rowBlockTracklets);\r
+               free(blockStartingTracklet);\r
+       }\r
+}\r
+\r
+__global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)\r
+{\r
+       //Initialize GPU RowBlocks and HitWeights\r
+       int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;\r
+       int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;\r
+       const int stride = blockDim.x * gridDim.x;\r
+       int4 i0, i1;\r
+       i0.x = i0.y = i0.z = i0.w = 0;\r
+       i1.x = i1.y = i1.z = i1.w = -1;\r
+       for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)\r
+               RowBlockPos[i] = i0;\r
+       for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)\r
+               rowBlockTracklets4[i] = i1;\r
+       for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)\r
+               sliceDataHitWeights4[i] = i0;\r
+}\r
+\r
+int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)\r
+{\r
+       //Primary reconstruction function\r
+\r
+       cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;\r
+\r
+       if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;\r
+       \r
+       if (!fCudaInitialized)\r
+       {\r
+           HLTError("GPUTracker not initialized");\r
+           return(1);\r
+       }\r
+       if (sliceCountLocal > fSliceCount)\r
+       {\r
+           HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);\r
+           return(1);\r
+       }\r
+       if (fThreadId != GetThread())\r
+       {\r
+           HLTError("GPUTracker context was initialized by different thread, Initializing Thread: %d, Processing Thread: %d", fThreadId, GetThread());\r
+           return(1);\r
+       }\r
+\r
+       if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);\r
+\r
+       if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)\r
+       {\r
+               HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));\r
+               return(1);\r
+       }\r
+\r
+       if (fPPMode) return(ReconstructPP(pOutput, pClusterData, firstSlice, sliceCountLocal));\r
+\r
+       memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);\r
+\r
+       if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");\r
+\r
+#ifdef HLTCA_GPU_TIME_PROFILE\r
+       unsigned long long int a, b, c, d;\r
+       AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);\r
+       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);\r
+#endif\r
+       \r
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
+       {\r
+               //Make this a GPU Tracker\r
+               fGpuTracker[iSlice].SetGPUTracker();\r
+               fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));\r
+               fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));\r
+               fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);\r
+\r
+               //Set Pointers to GPU Memory\r
+               char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);\r
+\r
+               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");\r
+               tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());\r
+               tmpMem = alignPointer(tmpMem, 1024 * 1024);\r
+\r
+               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");\r
+               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);\r
+               tmpMem = alignPointer(tmpMem, 1024 * 1024);\r
+\r
+               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");\r
+               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());\r
+               tmpMem = alignPointer(tmpMem, 1024 * 1024);\r
+\r
+               if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)\r
+               {\r
+                       HLTError("Insufficiant Track Memory");\r
+                       return(1);\r
+               }\r
+\r
+               if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)\r
+               {\r
+                       HLTError("Insufficiant Global Memory");\r
+                       return(1);\r
+               }\r
+\r
+               if (fDebugLevel >= 3)\r
+               {\r
+                       HLTInfo("GPU Global Memory Used: %d/%d, Page Locked Tracks Memory used: %d / %d", tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice), HLTCA_GPU_GLOBAL_MEMORY, fGpuTracker[iSlice].TrackMemorySize(), HLTCA_GPU_TRACKS_MEMORY);\r
+               }\r
+\r
+               //Initialize Startup Constants\r
+               *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;\r
+               *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;\r
+               *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;\r
+               fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = HLTCA_GPU_BLOCK_COUNT * (iSlice + 1) / sliceCountLocal - HLTCA_GPU_BLOCK_COUNT * (iSlice) / sliceCountLocal;\r
+               if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);\r
+               fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;\r
+               fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;\r
+               fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;\r
+               fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());\r
+       }\r
+\r
+#ifdef HLTCA_GPU_TEXTURE_FETCH\r
+               cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();\r
+               size_t offset;\r
+               if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
+               {\r
+                       HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);\r
+                       return(1);\r
+               }\r
+               cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();\r
+               if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
+               {\r
+                       HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);\r
+                       return(1);\r
+               }\r
+               cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();\r
+               if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
+               {\r
+                       HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);\r
+                       return(1);\r
+               }\r
+#endif\r
+\r
+       //Copy Tracker Object to GPU Memory\r
+       if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");\r
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE\r
+       char* tmpMem;\r
+       if (CudaFailedMsg(cudaMalloc(&tmpMem, 100000000))) return(1);\r
+       fGpuTracker[0].fStageAtSync = tmpMem;\r
+       CudaFailedMsg(cudaMemset(fGpuTracker[0].StageAtSync(), 0, 100000000));\r
+#endif\r
+       CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));\r
+\r
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
+       {\r
+               StandalonePerfTime(firstSlice + iSlice, 0);\r
+\r
+               //Initialize GPU Slave Tracker\r
+               if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");\r
+               fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));\r
+#ifdef HLTCA_GPU_TIME_PROFILE\r
+                       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);\r
+#endif\r
+               fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);\r
+\r
+                 if (fDebugLevel >= 4)\r
+                 {\r
+                         *fOutFile << std::endl << std::endl << "Reconstruction: " << iSlice << "/" << sliceCountLocal << " Total Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << " / " << fgkNSlices << std::endl;\r
+\r
+                         if (fDebugMask & 1) fSlaveTrackers[firstSlice + iSlice].DumpSliceData(*fOutFile);\r
+                 }\r
+\r
+#ifdef HLTCA_GPU_TIME_PROFILE\r
+                       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);\r
+               printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);\r
+#endif\r
+               if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)\r
+               {\r
+                       HLTError("Insufficiant Slice Data Memory");\r
+                       return(1);\r
+               }\r
+\r
+               if (fDebugLevel >= 3)\r
+               {\r
+                       HLTInfo("GPU Slice Data Memory Used: %d/%d", fSlaveTrackers[firstSlice + iSlice].Data().MemorySize(), HLTCA_GPU_SLICE_DATA_MEMORY);\r
+               }\r
+\r
+               //Initialize temporary memory where needed\r
+               if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");           \r
+               PreInitRowBlocks<<<30, 256, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());\r
+\r
+               //Copy Data to GPU Global Memory\r
+               CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));\r
+               CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));\r
+               CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));\r
+\r
+               if (fDebugLevel >= 4)\r
+               {\r
+                       if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");\r
+                       fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS);\r
+                       fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() );\r
+               }\r
+               \r
+               if (CUDASync("Initialization", iSlice, iSlice + firstSlice)) return(1);\r
+               StandalonePerfTime(firstSlice + iSlice, 1);\r
+\r
+               if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder (Slice %d/%d)", iSlice, sliceCountLocal);\r
+               AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);\r
+\r
+               if (CUDASync("Neighbours finder", iSlice, iSlice + firstSlice)) return 1;\r
+\r
+               StandalonePerfTime(firstSlice + iSlice, 2);\r
+\r
+               if (fDebugLevel >= 4)\r
+               {\r
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));\r
+                       if (fDebugMask & 2) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);\r
+               }\r
+\r
+               if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner (Slice %d/%d)", iSlice, sliceCountLocal);\r
+               AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);\r
+               if (CUDASync("Neighbours Cleaner", iSlice, iSlice + firstSlice)) return 1;\r
+\r
+               StandalonePerfTime(firstSlice + iSlice, 3);\r
+\r
+               if (fDebugLevel >= 4)\r
+               {\r
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));\r
+                       if (fDebugMask & 4) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);\r
+               }\r
+\r
+               if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder (Slice %d/%d)", iSlice, sliceCountLocal);\r
+               AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);\r
+               if (CUDASync("Start Hits Finder", iSlice, iSlice + firstSlice)) return 1;\r
+\r
+               StandalonePerfTime(firstSlice + iSlice, 4);\r
+\r
+               if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter (Slice %d/%d)", iSlice, sliceCountLocal);\r
+               AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<30, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);\r
+               if (CUDASync("Start Hits Sorter", iSlice, iSlice + firstSlice)) return 1;\r
+\r
+               StandalonePerfTime(firstSlice + iSlice, 5);\r
+\r
+               if (fDebugLevel >= 2)\r
+               {\r
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));\r
+                       if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d (Slice %d)", *fSlaveTrackers[firstSlice + iSlice].NTracklets(), iSlice);\r
+                       if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS)\r
+                       {\r
+                               HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant");\r
+                               return(1);\r
+                       }\r
+               }\r
+\r
+               if (fDebugLevel >= 4)\r
+               {\r
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));\r
+                       if (fDebugMask & 8)\r
+                       {\r
+                               *fOutFile << "Temporary ";\r
+                               fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);\r
+                       }\r
+                       uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());\r
+                       CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));\r
+                       if (fDebugMask & 16)\r
+                       {\r
+                               *fOutFile << "Start Hits Sort Vector:" << std::endl;\r
+                               for (int i = 0;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows();i++)\r
+                               {\r
+                                       *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;\r
+                               }\r
+                       }\r
+                       free(tmpMemory);\r
+\r
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));\r
+                       if (fDebugMask & 32) fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);\r
+               }\r
+\r
+               StandalonePerfTime(firstSlice + iSlice, 6);\r
+               \r
+               fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());\r
+       }\r
+\r
+       StandalonePerfTime(firstSlice, 7);\r
+#ifdef HLTCA_GPU_PREFETCHDATA\r
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
+       {\r
+               if (fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v) > ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4))\r
+               {\r
+                       HLTError("Insufficiant GPU shared Memory, required: %d, available %d", fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v), ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4));\r
+                       return(1);\r
+               }\r
+               if (fDebugLevel >= 1)\r
+               {\r
+                       static int infoShown = 0;\r
+                       if (!infoShown)\r
+                       {\r
+                               HLTInfo("GPU Shared Memory Cache Size: %d", 2 * fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v));\r
+                               infoShown = 1;\r
+                       }\r
+               }\r
+       }\r
+#endif\r
+\r
+       int nHardCollisions = 0;\r
+\r
+RestartTrackletConstructor:\r
+       if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");\r
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
+       {\r
+               AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);\r
+               if (CUDASync("Tracklet Initializer", iSlice, iSlice + firstSlice)) return 1;\r
+               if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice);\r
+       }\r
+\r
+       if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");\r
+       AliHLTTPCCATrackletConstructorGPU<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>();\r
+       if (CUDASync("Tracklet Constructor", 0, firstSlice)) return 1;\r
+       \r
+       StandalonePerfTime(firstSlice, 8);\r
+\r
+       if (fDebugLevel >= 4)\r
+       {\r
+               for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
+               {\r
+                       if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);\r
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));\r
+                       if (fDebugLevel >= 5)\r
+                       {\r
+                               HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());\r
+                       }\r
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));\r
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));\r
+                       if (fDebugMask & 128) fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);\r
+               }\r
+       }\r
+\r
+       int runSlices = 0;\r
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)\r
+       {\r
+               if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;\r
+               if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);\r
+               AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));\r
+               if (CUDASync("Tracklet Selector", iSlice, iSlice + firstSlice)) return 1;\r
+       }\r
+       StandalonePerfTime(firstSlice, 9);\r
+\r
+       int tmpSlice = 0, tmpSlice2 = 0;\r
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
+       {\r
+               if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");\r
+\r
+               while(tmpSlice < sliceCountLocal && (tmpSlice == iSlice || cudaStreamQuery(cudaStreams[tmpSlice]) == CUDA_SUCCESS))\r
+               {\r
+                       if (CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice]))) return(1);\r
+                       tmpSlice++;\r
+               }\r
+\r
+               while (tmpSlice2 < tmpSlice && (tmpSlice2 == iSlice ? cudaStreamSynchronize(cudaStreams[tmpSlice2]) : cudaStreamQuery(cudaStreams[tmpSlice2])) == CUDA_SUCCESS)\r
+               {\r
+                       CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].Tracks(), fGpuTracker[tmpSlice2].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + tmpSlice2].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));\r
+                       CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].TrackHits(), fGpuTracker[tmpSlice2].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + tmpSlice2].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));\r
+                       tmpSlice2++;\r
+               }\r
+\r
+               cudaStreamSynchronize(cudaStreams[iSlice]);\r
+\r
+               if (fDebugLevel >= 4)\r
+               {\r
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));\r
+                       if (fDebugMask & 256) fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);\r
+                       if (fDebugMask & 512) fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);\r
+               }\r
+\r
+               if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)\r
+               {\r
+                       if ((fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION || fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_WRONG_ROW)&& nHardCollisions++ < 10)\r
+                       {\r
+                               if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION)\r
+                               {\r
+                                       HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor");\r
+                               }\r
+                               else\r
+                               {\r
+                                       HLTWarning("Tracklet Constructor returned invalid row");\r
+                               }\r
+                               for (int i = 0;i < sliceCountLocal;i++)\r
+                               {\r
+                                       cudaThreadSynchronize();\r
+                                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));\r
+                                       *fSlaveTrackers[firstSlice + i].NTracks() = 0;\r
+                                       *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;\r
+                                       fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;\r
+                                       CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));\r
+                                       PreInitRowBlocks<<<30, 256>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());\r
+                               }\r
+                               goto RestartTrackletConstructor;\r
+                       }\r
+                       HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);\r
+                       return(1);\r
+               }\r
+               if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());\r
+\r
+               fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);\r
+#ifdef HLTCA_GPU_TIME_PROFILE\r
+               AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);\r
+#endif\r
+               fSlaveTrackers[firstSlice + iSlice].WriteOutput();\r
+#ifdef HLTCA_GPU_TIME_PROFILE\r
+               AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);\r
+               printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);\r
+#endif\r
+\r
+               if (fDebugLevel >= 4)\r
+               {\r
+                       delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();\r
+                       delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();\r
+               }\r
+       }\r
+\r
+       StandalonePerfTime(firstSlice, 10);\r
+\r
+       if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");\r
+\r
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE\r
+       char* stageAtSync = (char*) malloc(100000000);\r
+       CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].StageAtSync(), 100 * 1000 * 1000, cudaMemcpyDeviceToHost));\r
+       cudaFree(fGpuTracker[0].StageAtSync());\r
+\r
+       FILE* fp = fopen("profile.txt", "w+");\r
+       FILE* fp2 = fopen("profile.bmp", "w+b");\r
+       int nEmptySync = 0, fEmpty;\r
+\r
+       const int bmpheight = 1000;\r
+       BITMAPFILEHEADER bmpFH;\r
+       BITMAPINFOHEADER bmpIH;\r
+       ZeroMemory(&bmpFH, sizeof(bmpFH));\r
+       ZeroMemory(&bmpIH, sizeof(bmpIH));\r
+       \r
+       bmpFH.bfType = 19778; //"BM"\r
+       bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1) * bmpheight ;\r
+       bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);\r
+\r
+       bmpIH.biSize = sizeof(bmpIH);\r
+       bmpIH.biWidth = HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1;\r
+       bmpIH.biHeight = bmpheight;\r
+       bmpIH.biPlanes = 1;\r
+       bmpIH.biBitCount = 32;\r
+\r
+       fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);\r
+       fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);  \r
+\r
+       for (int i = 0;i < bmpheight * HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;i += HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT)\r
+       {\r
+               fEmpty = 1;\r
+               for (int j = 0;j < HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;j++)\r
+               {\r
+                       fprintf(fp, "%d\t", stageAtSync[i + j]);\r
+                       int color = 0;\r
+                       if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);\r
+                       if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);\r
+                       if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);\r
+                       if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);\r
+                       fwrite(&color, 1, sizeof(int), fp2);\r
+                       if (j > 0 && j % 32 == 0)\r
+                       {\r
+                               color = RGB(255, 255, 255);\r
+                               fwrite(&color, 1, 4, fp2);\r
+                       }\r
+                       if (stageAtSync[i + j]) fEmpty = 0;\r
+               }\r
+               fprintf(fp, "\n");\r
+               if (fEmpty) nEmptySync++;\r
+               else nEmptySync = 0;\r
+               //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;\r
+       }\r
+\r
+       fclose(fp);\r
+       fclose(fp2);\r
+       free(stageAtSync);\r
+#endif \r
+\r
+       return(0);\r
+}\r
+\r
+__global__ void ClearPPHitWeights(int sliceCount)\r
+{\r
+       //Clear HitWeights\r
+       \r
+       for (int k = 0;k < sliceCount;k++)\r
+       {\r
+               AliHLTTPCCATracker &tracker = ((AliHLTTPCCATracker*) gAliHLTTPCCATracker)[k];\r
+               int4* const pHitWeights = (int4*) tracker.Data().HitWeights();\r
+               const int dwCount = tracker.Data().NumberOfHitsPlusAlign();\r
+               const int stride = blockDim.x * gridDim.x;\r
+               int4 i0;\r
+               i0.x = i0.y = i0.z = i0.w = 0;\r
+       \r
+               for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < dwCount * sizeof(int) / sizeof(int4);i += stride)\r
+               {\r
+                       pHitWeights[i] = i0;\r
+               }\r
+       }\r
+}\r
+\r
+int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)\r
+{\r
+       //Primary reconstruction function for small events (PP)\r
+\r
+       memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);\r
+\r
+       if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");\r
+\r
+#ifdef HLTCA_GPU_TIME_PROFILE\r
+       unsigned long long int a, b, c, d;\r
+       AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);\r
+       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);\r
+#endif\r
+\r
+       char* tmpSliceMemHost = (char*) SliceDataMemory(fHostLockedMemory, 0);\r
+       char* tmpSliceMemGpu = (char*) SliceDataMemory(fGPUMemory, 0);\r
+\r
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
+       {\r
+               StandalonePerfTime(firstSlice + iSlice, 0);\r
+\r
+               //Initialize GPU Slave Tracker\r
+               if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");\r
+               fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(tmpSliceMemHost, RowMemory(fHostLockedMemory, firstSlice + iSlice));\r
+#ifdef HLTCA_GPU_TIME_PROFILE\r
+                       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);\r
+#endif\r
+               fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);\r
+#ifdef HLTCA_GPU_TIME_PROFILE\r
+                       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);\r
+               printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);\r
+#endif\r
+               if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)\r
+               {\r
+                       HLTError("Insufficiant Slice Data Memory");\r
+                       return(1);\r
+               }\r
+\r
+               //Make this a GPU Tracker\r
+               fGpuTracker[iSlice].SetGPUTracker();\r
+               fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));\r
+\r
+\r
+               fGpuTracker[iSlice].SetGPUSliceDataMemory(tmpSliceMemGpu, RowMemory(fGPUMemory, iSlice));\r
+               fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);\r
+\r
+               tmpSliceMemHost += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();\r
+               tmpSliceMemHost = alignPointer(tmpSliceMemHost, 64 * 1024);\r
+               tmpSliceMemGpu += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();\r
+               tmpSliceMemGpu = alignPointer(tmpSliceMemGpu, 64 * 1024);\r
+\r
+\r
+               //Set Pointers to GPU Memory\r
+               char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);\r
+\r
+               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");\r
+               tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());\r
+               tmpMem = alignPointer(tmpMem, 64 * 1024);\r
+\r
+               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");\r
+               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);\r
+               tmpMem = alignPointer(tmpMem, 64 * 1024);\r
+\r
+               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");\r
+               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());\r
+               tmpMem = alignPointer(tmpMem, 64 * 1024);\r
+\r
+               if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)\r
+               {\r
+                       HLTError("Insufficiant Track Memory");\r
+                       return(1);\r
+               }\r
+\r
+               if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)\r
+               {\r
+                       HLTError("Insufficiant Global Memory");\r
+                       return(1);\r
+               }\r
+\r
+               //Initialize Startup Constants\r
+               *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;\r
+               *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;\r
+               *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;\r
+               fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;\r
+\r
+               fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());\r
+\r
+               if (CUDASync("Initialization", iSlice, iSlice + firstSlice)) return(1);\r
+               StandalonePerfTime(firstSlice + iSlice, 1);\r
+       }\r
+\r
+#ifdef HLTCA_GPU_TEXTURE_FETCH\r
+               cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();\r
+               size_t offset;\r
+               if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
+               {\r
+                       HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);\r
+                       return(1);\r
+               }\r
+               cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();\r
+               if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
+               {\r
+                       HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);\r
+                       return(1);\r
+               }\r
+               cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();\r
+               if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
+               {\r
+                       HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);\r
+                       return(1);\r
+               }\r
+#endif\r
+\r
+       //Copy Tracker Object to GPU Memory\r
+       if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");\r
+       CudaFailedMsg(cudaMemcpyToSymbol(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice));\r
+\r
+       //Copy Data to GPU Global Memory\r
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
+       {\r
+               CudaFailedMsg(cudaMemcpy(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice));\r
+               //printf("%lld %lld %d %d\n", (size_t) (char*) fGpuTracker[iSlice].Data().Memory(), (size_t) (char*) fSlaveTrackers[firstSlice + iSlice].Data().Memory(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize());\r
+       }\r
+       //CudaFailedMsg(cudaMemcpy(SliceDataMemory(fGPUMemory, 0), SliceDataMemory(fHostLockedMemory, 0), tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0), cudaMemcpyHostToDevice));\r
+       //printf("%lld %lld %d\n", (size_t) (char*) SliceDataMemory(fGPUMemory, 0), (size_t) (char*) SliceDataMemory(fHostLockedMemory, 0), (int) (size_t) (tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0)));\r
+       CudaFailedMsg(cudaMemcpy(fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyHostToDevice));\r
+       CudaFailedMsg(cudaMemcpy(fGpuTracker[0].SliceDataRows(), fSlaveTrackers[firstSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) * sliceCountLocal, cudaMemcpyHostToDevice));\r
+\r
+       if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");\r
+       AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursFinder> <<<30, 256>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows());\r
+       if (CUDASync("Neighbours finder", 0, firstSlice)) return 1;\r
+       StandalonePerfTime(firstSlice, 2);\r
+       if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");\r
+       AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursCleaner> <<<30, 256>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 2);\r
+       if (CUDASync("Neighbours Cleaner", 0, firstSlice)) return 1;\r
+       StandalonePerfTime(firstSlice, 3);\r
+       if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");\r
+       AliHLTTPCCAProcessMultiA<AliHLTTPCCAStartHitsFinder> <<<30, 256>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 6);\r
+       if (CUDASync("Start Hits Finder", 0, firstSlice)) return 1;\r
+       StandalonePerfTime(firstSlice, 4);\r
+\r
+       ClearPPHitWeights <<<30, 256>>>(sliceCountLocal);\r
+       if (CUDASync("Clear Hit Weights", 0, firstSlice)) return 1;\r
+\r
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
+       {\r
+               fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());\r
+       }\r
+\r
+       StandalonePerfTime(firstSlice, 7);\r
+\r
+       if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");\r
+       AliHLTTPCCATrackletConstructorGPUPP<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal);\r
+       if (CUDASync("Tracklet Constructor PP", 0, firstSlice)) return 1;\r
+       \r
+       StandalonePerfTime(firstSlice, 8);\r
+\r
+       AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal);\r
+       if (CUDASync("Tracklet Selector", 0, firstSlice)) return 1;\r
+       StandalonePerfTime(firstSlice, 9);\r
+\r
+       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice].CommonMemory(), fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyDeviceToHost));\r
+\r
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
+       {\r
+               if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");\r
+\r
+               CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost));\r
+               CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost));\r
+\r
+               if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)\r
+               {\r
+                       HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);\r
+                       return(1);\r
+               }\r
+               if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());\r
+\r
+               fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);\r
+#ifdef HLTCA_GPU_TIME_PROFILE\r
+               AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);\r
+#endif\r
+               fSlaveTrackers[firstSlice + iSlice].WriteOutput();\r
+#ifdef HLTCA_GPU_TIME_PROFILE\r
+               AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);\r
+               printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);\r
+#endif\r
+       }\r
+\r
+       StandalonePerfTime(firstSlice, 10);\r
+\r
+       if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");\r
+\r
+       return(0);\r
+}\r
+\r
+int AliHLTTPCCAGPUTrackerNVCC::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)\r
+{\r
+       //Initialize Slice Tracker Parameter for a slave tracker\r
+       fSlaveTrackers[iSlice].Initialize(param);\r
+       if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)\r
+       {\r
+               HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);\r
+               return(1);\r
+       }\r
+       return(0);\r
+}\r
+\r
+int AliHLTTPCCAGPUTrackerNVCC::ExitGPU()\r
+{\r
+       //Uninitialize CUDA\r
+       cudaThreadSynchronize();\r
+       if (fGPUMemory)\r
+       {\r
+               cudaFree(fGPUMemory);\r
+               fGPUMemory = NULL;\r
+       }\r
+       if (fHostLockedMemory)\r
+       {\r
+               for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)\r
+               {\r
+                       cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);\r
+               }\r
+               free(fpCudaStreams);\r
+               fGpuTracker = NULL;\r
+               cudaFreeHost(fHostLockedMemory);\r
+       }\r
+\r
+       if (CudaFailedMsg(cudaThreadExit()))\r
+       {\r
+               HLTError("Could not uninitialize GPU");\r
+               return(1);\r
+       }\r
+       HLTInfo("CUDA Uninitialized");\r
+       fgGPUUsed = false;\r
+       fCudaInitialized = 0;\r
+       return(0);\r
+}\r
+\r
+void AliHLTTPCCAGPUTrackerNVCC::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)\r
+{\r
+       //Set Output Control Pointers\r
+       fOutputControl = val;\r
+       for (int i = 0;i < fgkNSlices;i++)\r
+       {\r
+               fSlaveTrackers[i].SetOutputControl(val);\r
+       }\r
+}\r
+\r
+int AliHLTTPCCAGPUTrackerNVCC::GetThread()\r
+{\r
+       //Get Thread ID\r
+#ifdef R__WIN32\r
+       return((int) (size_t) GetCurrentThread());\r
+#else\r
+       return((int) syscall (SYS_gettid));\r
+#endif\r
+}\r
+\r
+unsigned long long int* AliHLTTPCCAGPUTrackerNVCC::PerfTimer(int iSlice, unsigned int i)\r
+{\r
+       //Returns pointer to PerfTimer i of slice iSlice\r
+       return(fSlaveTrackers ? fSlaveTrackers[iSlice].PerfTimer(i) : NULL);\r
+}\r
+\r
+const AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerNVCC::OutputControl() const\r
+{\r
+       //Return Pointer to Output Control Structure\r
+       return fOutputControl;\r
+}\r
+\r
+int AliHLTTPCCAGPUTrackerNVCC::GetSliceCount() const\r
+{\r
+       //Return max slice count processable\r
+       return(fSliceCount);\r
+}\r
+\r
+AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()\r
+{\r
+       return new AliHLTTPCCAGPUTrackerNVCC;\r
+}\r
+\r
+void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr)\r
+{\r
+       delete ptr;\r
+}\r
+\r