]> git.uio.no Git - u/mrichter/AliRoot.git/commitdiff
update of the GPU tracker
authorsgorbuno <sgorbuno@f7af4fe6-9843-0410-8265-dc069ae4e863>
Fri, 23 Jul 2010 13:08:49 +0000 (13:08 +0000)
committersgorbuno <sgorbuno@f7af4fe6-9843-0410-8265-dc069ae4e863>
Fri, 23 Jul 2010 13:08:49 +0000 (13:08 +0000)
17 files changed:
HLT/TPCLib/tracking-ca/AliHLTTPCCAClusterData.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUConfig.h
HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.h
HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cu [deleted file]
HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cu.patch [deleted file]
HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cxx [deleted file]
HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.h [deleted file]
HLT/TPCLib/tracking-ca/AliHLTTPCCASliceData.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCASliceOutput.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsSorter.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerComponent.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.h
HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletConstructor.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletConstructor.h
HLT/TPCLib/tracking-ca/MemoryAssignmentHelpers.h

index f44e64f5cf9634c72cb301b670f0f2e339563732..f505622d29d04201593d2f57f93844a857b17f11 100644 (file)
@@ -88,7 +88,6 @@ template <class T> void AliHLTTPCCAClusterData::ReadEventVector(std::vector<T> &
 #ifdef HLTCA_STANDALONE
                if (tmpData[j].fRow < 0 || tmpData[j].fRow >= HLTCA_ROW_COUNT)
                {
-                       printf("Invalid Row Read %d at Cluster %d\n", tmpData[j].fRow, j);
                        exit(1);
                }
 #endif
index 21f337c6bd300a2642ede3ee90f70eea44d3abaa..b7ba96286beec1c5fe4bd4a865ff1e3b637dfa6e 100644 (file)
@@ -2,7 +2,7 @@
 #define ALIHLTTPCCAGPUCONFIG_H
 
 //GPU Run Configuration
-#define HLTCA_GPU_BLOCK_COUNT 30
+#define HLTCA_GPU_BLOCK_COUNT 15
 #define HLTCA_GPU_THREAD_COUNT 256
 
 //GPU Parameters
@@ -14,8 +14,6 @@
 
 #define HLTCA_GPU_ROWALIGNMENT uint4                                   //Align Row Hits and Grid
 #define HLTCA_GPU_ROWCOPY int                                                  //must not be bigger than row alignment!!!
-#define HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS 32   //Amound of threads to reserve for memory copy
-//#define HLTCA_GPU_PREFETCHDATA                                               //Fetch Row Data (Hits / Grid) into shared memory during Tracklet Construction
 
 #define HLTCA_GPU_SCHED_ROW_STEP 32                                            //Amount of Rows to process in one step before rescheduling
 #define HLTCA_GPU_SCHED_FIXED_START                                            //Assign each GPU thread a start tracklet to start with instead of using the scheduler to obtain start tracklet
@@ -39,6 +37,8 @@
 //#define HLTCA_GPU_EMULATION_SINGLE_TRACKLET 1313             //Run Tracklet constructor on on single Tracklet in Device Emulation Mode
 //#define HLTCA_GPU_EMULATION_DEBUG_TRACKLET 1313
 
+#define HLTCA_GPU_DEFAULT_MAX_SLICE_COUNT 12
+
 #define HLTCA_GPU_TRACKER_CONSTANT_MEM 65000                   //Amount of Constant Memory to reserve
 
 #define HLTCA_GPU_TRACKER_OBJECT_MEMORY 1024 * 1024            //Total amount of Memory to reserve for GPU Tracker Objects
 #ifdef HLTCA_GPU_TEXTURE_FETCHa
 #undef HLTCA_GPU_TEXTURE_FETCHa
 #endif
-//No Shared memory cache for CPU Tracker
-#ifdef HLTCA_GPU_PREFETCHDATA
-#undef HLTCA_GPU_PREFETCHDATA
-#endif
+
 //Do not cache Row Hits during Tracklet selection in Registers for CPU Tracker
 #undef HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE
 #define HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE 0
 #define HLTCA_GPU_SORT_STARTHITS
 #endif
 
-//If not using Row Based schreduling or not using shared memory cache do not reserve threads for shared memory copy
-#if !defined(HLTCA_GPU_PREFETCHDATA) | !defined(HLTCA_GPU_RESCHED)
-#undef HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS
-#define HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS 0
-#endif
-
 //Error Codes for GPU Tracker
 #define HLTCA_GPU_ERROR_NONE 0
 #define HLTCA_GPU_ERROR_ROWBLOCK_TRACKLET_OVERFLOW 1
index 889e227e5b18d7ccd22a87d876715daf0c87ccca..e938d318f9254e6fd29d2c5d3b93477e6c22fcc4 100644 (file)
@@ -26,7 +26,6 @@ int AliHLTTPCCAGPUTracker::InitGPU(int /*sliceCount*/, int /*forceDeviceID*/) {
 void AliHLTTPCCAGPUTracker::SetDebugLevel(int /*dwLevel*/, std::ostream* /*NewOutFile*/) {}
 int AliHLTTPCCAGPUTracker::SetGPUTrackerOption(char* /*OptionName*/, int /*OptionValue*/) {return(1);}
 int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** /*pTracker*/, AliHLTTPCCAClusterData* /*pClusterData*/, int /*fFirstSlice*/, int /*fSliceCount*/) {return(1);}
-int AliHLTTPCCAGPUTracker::ReconstructPP(AliHLTTPCCASliceOutput** /*pTracker*/, AliHLTTPCCAClusterData* /*pClusterData*/, int /*fFirstSlice*/, int /*fSliceCount*/) {return(1);}
 int AliHLTTPCCAGPUTracker::ExitGPU() {return(0);}
 int AliHLTTPCCAGPUTracker::InitializeSliceParam(int /*iSlice*/, AliHLTTPCCAParam& /*param*/) { return 1; }
 void AliHLTTPCCAGPUTracker::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* /*val*/) {}
index 3acd461742bfc98043bc4a5855806b0ca88ab798..831c71b358c794aecfda231fa74f56054063ed9c 100644 (file)
@@ -23,9 +23,8 @@ public:
        AliHLTTPCCAGPUTracker();
        virtual ~AliHLTTPCCAGPUTracker();
 
-       virtual int InitGPU(int sliceCount = 12, int forceDeviceID = -1);
+       virtual int InitGPU(int sliceCount = -1, int forceDeviceID = -1);
        virtual int Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1);
-       virtual int ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1);
        virtual int ExitGPU();
 
        virtual void SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile = NULL);
diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cu b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cu
deleted file mode 100644 (file)
index b8327b5..0000000
+++ /dev/null
@@ -1,1303 +0,0 @@
-// **************************************************************************\r
-// This file is property of and copyright by the ALICE HLT Project          *\r
-// ALICE Experiment at CERN, All rights reserved.                           *\r
-//                                                                          *\r
-// Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *\r
-//                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *\r
-//                                     David Rohr <drohr@kip.uni-heidelberg.de>                                *\r
-//                  for The ALICE HLT Project.                              *\r
-//                                                                          *\r
-// Permission to use, copy, modify and distribute this software and its     *\r
-// documentation strictly for non-commercial purposes is hereby granted     *\r
-// without fee, provided that the above copyright notice appears in all     *\r
-// copies and that both the copyright notice and this permission notice     *\r
-// appear in the supporting documentation. The authors make no claims       *\r
-// about the suitability of this software for any purpose. It is            *\r
-// provided "as is" without express or implied warranty.                    *\r
-//                                                                          *\r
-//***************************************************************************\r
-\r
-#include "AliHLTTPCCAGPUTrackerNVCC.h"\r
-\r
-#ifdef HLTCA_GPUCODE\r
-#include <cuda.h>\r
-#include <sm_11_atomic_functions.h>\r
-#include <sm_12_atomic_functions.h>\r
-#endif\r
-\r
-#ifdef R__WIN32\r
-#else\r
-#include <sys/syscall.h>\r
-#include <semaphore.h>\r
-#include <fcntl.h>\r
-#endif\r
-\r
-#include "AliHLTTPCCADef.h"\r
-#include "AliHLTTPCCAGPUConfig.h"\r
-\r
-\r
-#include <iostream>\r
-\r
-//Disable assertions since they produce errors in GPU Code\r
-#ifdef assert\r
-#undef assert\r
-#endif\r
-#define assert(param)\r
-\r
-__constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];\r
-#ifdef HLTCA_GPU_TEXTURE_FETCH\r
-texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;\r
-texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;\r
-texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;\r
-#endif\r
-\r
-//Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code\r
-#include "AliHLTTPCCATrackParam.cxx"\r
-#include "AliHLTTPCCATrack.cxx" \r
-\r
-#include "AliHLTTPCCAHitArea.cxx"\r
-#include "AliHLTTPCCAGrid.cxx"\r
-#include "AliHLTTPCCARow.cxx"\r
-#include "AliHLTTPCCAParam.cxx"\r
-#include "AliHLTTPCCATracker.cxx"\r
-\r
-#include "AliHLTTPCCAProcess.h"\r
-\r
-#include "AliHLTTPCCATrackletSelector.cxx"\r
-#include "AliHLTTPCCANeighboursFinder.cxx"\r
-#include "AliHLTTPCCANeighboursCleaner.cxx"\r
-#include "AliHLTTPCCAStartHitsFinder.cxx"\r
-#include "AliHLTTPCCAStartHitsSorter.cxx"\r
-#include "AliHLTTPCCATrackletConstructor.cxx"\r
-\r
-#include "MemoryAssignmentHelpers.h"\r
-\r
-#ifndef HLTCA_STANDALONE\r
-#include "AliHLTDefinitions.h"\r
-#include "AliHLTSystem.h"\r
-#endif\r
-\r
-ClassImp( AliHLTTPCCAGPUTrackerNVCC )\r
-\r
-bool AliHLTTPCCAGPUTrackerNVCC::fgGPUUsed = false;\r
-\r
-#define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem"\r
-\r
-AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() :\r
-       fGpuTracker(NULL),\r
-       fGPUMemory(NULL),\r
-       fHostLockedMemory(NULL),\r
-       fDebugLevel(0),\r
-       fDebugMask(0xFFFFFFFF),\r
-       fOutFile(NULL),\r
-       fGPUMemSize(0),\r
-       fpCudaStreams(NULL),\r
-       fSliceCount(0),\r
-       fOutputControl(NULL),\r
-       fThreadId(0),\r
-       fCudaInitialized(0),\r
-       fPPMode(0)\r
-       {};\r
-\r
-AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC() {};\r
-\r
-void AliHLTTPCCAGPUTrackerNVCC::ReleaseGlobalLock(void* sem)\r
-{\r
-       //Release the global named semaphore that locks GPU Initialization\r
-#ifdef R__WIN32\r
-       HANDLE* h = (HANDLE*) sem;\r
-       ReleaseSemaphore(*h, 1, NULL);\r
-       CloseHandle(*h);\r
-       delete h;\r
-#else\r
-       sem_t* pSem = (sem_t*) sem;\r
-       sem_post(pSem);\r
-       sem_unlink(SemLockName);\r
-#endif\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::CheckMemorySizes(int sliceCount)\r
-{\r
-       //Check constants for correct memory sizes\r
-  if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)\r
-  {\r
-         HLTError("Insufficiant Tracker Object Memory");\r
-         return(1);\r
-  }\r
-\r
-  if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)\r
-  {\r
-         HLTError("Insufficiant Common Memory");\r
-         return(1);\r
-  }\r
-\r
-  if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)\r
-  {\r
-         HLTError("Insufficiant Row Memory");\r
-         return(1);\r
-  }\r
-\r
-  if (fDebugLevel >= 3)\r
-  {\r
-         HLTInfo("Memory usage: Tracker Object %d / %d, Common Memory %d / %d, Row Memory %d / %d", sizeof(AliHLTTPCCATracker) * sliceCount, HLTCA_GPU_TRACKER_OBJECT_MEMORY, fgkNSlices * AliHLTTPCCATracker::CommonMemorySize(), HLTCA_GPU_COMMON_MEMORY, fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), HLTCA_GPU_ROWS_MEMORY);\r
-  }\r
-  return(0);\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)\r
-{\r
-       //Find best CUDA device, initialize and allocate memory\r
-\r
-       if (CheckMemorySizes(sliceCount)) return(1);\r
-\r
-#ifdef R__WIN32\r
-       HANDLE* semLock = new HANDLE;\r
-       *semLock = CreateSemaphore(NULL, 1, 1, SemLockName);\r
-       if (*semLock == NULL)\r
-       {\r
-               HLTError("Error creating GPUInit Semaphore");\r
-               return(1);\r
-       }\r
-       WaitForSingleObject(*semLock, INFINITE);\r
-#else\r
-       sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1);\r
-       if (semLock == SEM_FAILED)\r
-       {\r
-               HLTError("Error creating GPUInit Semaphore");\r
-               return(1);\r
-       }\r
-       sem_wait(semLock);\r
-#endif\r
-\r
-       if (fgGPUUsed)\r
-       {\r
-           HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in same process");\r
-               ReleaseGlobalLock(semLock);\r
-           return(1);\r
-       }\r
-       fgGPUUsed = 1;\r
-       fThreadId = GetThread();\r
-\r
-       cudaDeviceProp fCudaDeviceProp;\r
-\r
-       fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);\r
-\r
-#ifndef CUDA_DEVICE_EMULATION\r
-       int count, bestDevice = -1;\r
-       long long int bestDeviceSpeed = 0, deviceSpeed;\r
-       if (CudaFailedMsg(cudaGetDeviceCount(&count)))\r
-       {\r
-               HLTError("Error getting CUDA Device Count");\r
-               fgGPUUsed = 0;\r
-               ReleaseGlobalLock(semLock);\r
-               return(1);\r
-       }\r
-       if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");\r
-       for (int i = 0;i < count;i++)\r
-       {\r
-               unsigned int free, total;\r
-               cuInit(0);\r
-               CUdevice tmpDevice;\r
-               cuDeviceGet(&tmpDevice, i);\r
-               CUcontext tmpContext;\r
-               cuCtxCreate(&tmpContext, 0, tmpDevice);\r
-               if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";\r
-               cuCtxDestroy(tmpContext);\r
-               if (CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i))) continue;\r
-\r
-               int deviceOK = sliceCount <= fCudaDeviceProp.multiProcessorCount && fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && free >= fGPUMemSize;\r
-\r
-               if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %d / %lld)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, free, (long long int) fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");\r
-               deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free;\r
-               if (deviceOK && deviceSpeed > bestDeviceSpeed)\r
-               {\r
-                       bestDevice = i;\r
-                       bestDeviceSpeed = deviceSpeed;\r
-               }\r
-       }\r
-       if (bestDevice == -1)\r
-       {\r
-               HLTWarning("No CUDA Device available, aborting CUDA Initialisation");\r
-               HLTInfo("Requiring Revision 1.3, Mem: %d", fGPUMemSize);\r
-               fgGPUUsed = 0;\r
-               ReleaseGlobalLock(semLock);\r
-               return(1);\r
-       }\r
-\r
-  int cudaDevice;\r
-  if (forceDeviceID == -1)\r
-         cudaDevice = bestDevice;\r
-  else\r
-         cudaDevice = forceDeviceID;\r
-#else\r
-       int cudaDevice = 0;\r
-#endif\r
-\r
-  cudaGetDeviceProperties(&fCudaDeviceProp ,cudaDevice ); \r
-\r
-  if (fDebugLevel >= 1)\r
-  {\r
-         HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);\r
-         HLTInfo("totalGlobalMem = %lld", (unsigned long long int) fCudaDeviceProp.totalGlobalMem);\r
-         HLTInfo("sharedMemPerBlock = %lld", (unsigned long long int) fCudaDeviceProp.sharedMemPerBlock);\r
-         HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);\r
-         HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);\r
-         HLTInfo("memPitch = %lld", (unsigned long long int) fCudaDeviceProp.memPitch);\r
-         HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);\r
-         HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);\r
-         HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);\r
-         HLTInfo("totalConstMem = %lld", (unsigned long long int) fCudaDeviceProp.totalConstMem);\r
-         HLTInfo("major = %d", fCudaDeviceProp.major);\r
-         HLTInfo("minor = %d", fCudaDeviceProp.minor);\r
-         HLTInfo("clockRate %d= ", fCudaDeviceProp.clockRate);\r
-         HLTInfo("textureAlignment %lld= ", (unsigned long long int) fCudaDeviceProp.textureAlignment);\r
-  }\r
-\r
-  if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))\r
-  {\r
-       HLTError( "Unsupported CUDA Device" );\r
-       fgGPUUsed = 0;\r
-       ReleaseGlobalLock(semLock);\r
-       return(1);\r
-  }\r
-\r
-  if (CudaFailedMsg(cudaSetDevice(cudaDevice)))\r
-  {\r
-         HLTError("Could not set CUDA Device!");\r
-         fgGPUUsed = 0;\r
-         ReleaseGlobalLock(semLock);\r
-         return(1);\r
-  }\r
-\r
-  if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))\r
-  {\r
-         HLTError("CUDA Memory Allocation Error");\r
-         cudaThreadExit();\r
-         fgGPUUsed = 0;\r
-         ReleaseGlobalLock(semLock);\r
-         return(1);\r
-  }\r
-  ReleaseGlobalLock(semLock);\r
-  if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);\r
-  int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;\r
-  if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))\r
-  {\r
-         cudaFree(fGPUMemory);\r
-         cudaThreadExit();\r
-         HLTError("Error allocating Page Locked Host Memory");\r
-         fgGPUUsed = 0;\r
-         return(1);\r
-  }\r
-  if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);\r
-\r
-  if (fDebugLevel >= 1)\r
-  {\r
-         CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));\r
-  }\r
-\r
-  fSliceCount = sliceCount;\r
-  //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory\r
-  fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);\r
-\r
-  for (int i = 0;i < fgkNSlices;i++)\r
-  {\r
-    fSlaveTrackers[i].SetGPUTracker();\r
-       fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));\r
-       fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));\r
-  }\r
-\r
-  fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));\r
-  cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;\r
-  for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)\r
-  {\r
-       if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))\r
-       {\r
-           cudaFree(fGPUMemory);\r
-           cudaFreeHost(fHostLockedMemory);\r
-           cudaThreadExit();\r
-           HLTError("Error creating CUDA Stream");\r
-           fgGPUUsed = 0;\r
-           return(1);\r
-       }\r
-  }\r
-\r
-  fCudaInitialized = 1;\r
-  HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %d, Max slices: %d)", cudaDevice, fCudaDeviceProp.name, fThreadId, fSliceCount);\r
-\r
-#if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)\r
-  if (fDebugLevel < 2)\r
-  {\r
-         //Do one initial run for Benchmark reasons\r
-         const int useDebugLevel = fDebugLevel;\r
-         fDebugLevel = 0;\r
-         AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];\r
-\r
-         std::ifstream fin;\r
-\r
-         AliHLTTPCCAParam tmpParam;\r
-         AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;\r
-\r
-         fin.open("events/settings.dump");\r
-         int tmpCount;\r
-         fin >> tmpCount;\r
-         for (int i = 0;i < sliceCount;i++)\r
-         {\r
-               fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);\r
-               tmpParam.ReadSettings(fin);\r
-               InitializeSliceParam(i, tmpParam);\r
-         }\r
-         fin.close();\r
-\r
-         fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);\r
-         for (int i = 0;i < sliceCount;i++)\r
-         {\r
-               tmpCluster[i].StartReading(i, 0);\r
-               tmpCluster[i].ReadEvent(fin);\r
-               tmpCluster[i].FinishReading();\r
-         }\r
-         fin.close();\r
-\r
-         AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];\r
-         memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));\r
-\r
-         Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);\r
-         for (int i = 0;i < sliceCount;i++)\r
-         {\r
-                 free(tmpOutput[i]);\r
-                 tmpOutput[i] = NULL;\r
-                 fSlaveTrackers[i].SetOutputControl(NULL);\r
-         }\r
-         delete[] tmpOutput;\r
-         delete[] tmpCluster;\r
-         fDebugLevel = useDebugLevel;\r
-  }\r
-#endif\r
-  return(0);\r
-}\r
-\r
-template <class T> inline T* AliHLTTPCCAGPUTrackerNVCC::alignPointer(T* ptr, int alignment)\r
-{\r
-       //Macro to align Pointers.\r
-       //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker\r
-       //(As long as every single data structure is <= 1 MB)\r
-\r
-       size_t adr = (size_t) ptr;\r
-       if (adr % alignment)\r
-       {\r
-               adr += alignment - (adr % alignment);\r
-       }\r
-       return((T*) adr);\r
-}\r
-\r
-bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsg(cudaError_t error)\r
-{\r
-       //Check for CUDA Error and in the case of an error display the corresponding error string\r
-       if (error == cudaSuccess) return(false);\r
-       HLTWarning("CUDA Error: %d / %s", error, cudaGetErrorString(error));\r
-       return(true);\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state, int sliceLocal, int slice)\r
-{\r
-       //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards\r
-\r
-       if (fDebugLevel == 0) return(0);\r
-       cudaError cuErr;\r
-       cuErr = cudaGetLastError();\r
-       if (cuErr != cudaSuccess)\r
-       {\r
-               HLTError("Cuda Error %s while running kernel (%s) (Slice %d; %d/%d)", cudaGetErrorString(cuErr), state, sliceLocal, slice, fgkNSlices);\r
-               return(1);\r
-       }\r
-       if (CudaFailedMsg(cudaThreadSynchronize()))\r
-       {\r
-               HLTError("CUDA Error while synchronizing (%s) (Slice %d; %d/%d)", state, sliceLocal, slice, fgkNSlices);\r
-               return(1);\r
-       }\r
-       if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");\r
-       return(0);\r
-}\r
-\r
-void AliHLTTPCCAGPUTrackerNVCC::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)\r
-{\r
-       //Set Debug Level and Debug output File if applicable\r
-       fDebugLevel = dwLevel;\r
-       if (NewOutFile) fOutFile = NewOutFile;\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::SetGPUTrackerOption(char* OptionName, int OptionValue)\r
-{\r
-       //Set a specific GPU Tracker Option\r
-       if (strcmp(OptionName, "PPMode") == 0)\r
-       {\r
-               fPPMode = OptionValue;\r
-       }\r
-       else if (strcmp(OptionName, "DebugMask") == 0)\r
-       {\r
-               fDebugMask = OptionValue;\r
-       }\r
-       else\r
-       {\r
-               HLTError("Unknown Option: %s", OptionName);\r
-               return(1);\r
-       }\r
-       return(0);\r
-}\r
-\r
-#ifdef HLTCA_STANDALONE\r
-void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int iSlice, int i)\r
-{\r
-  //Run Performance Query for timer i of slice iSlice\r
-  if (fDebugLevel >= 1)\r
-  {\r
-         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));\r
-  }\r
-}\r
-#else\r
-void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}\r
-#endif\r
-\r
-void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)\r
-{\r
-       //Dump Rowblocks to File\r
-       if (fDebugLevel >= 4)\r
-       {\r
-               *fOutFile << "RowBlock Tracklets (Slice " << tracker[iSlice].Param().ISlice() << " (" << iSlice << " of reco))";\r
-               *fOutFile << " after Tracklet Reconstruction";\r
-               *fOutFile << std::endl;\r
-       \r
-               int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);\r
-               int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);\r
-               uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT);\r
-               CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));\r
-               CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));\r
-               CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT, cudaMemcpyDeviceToHost));\r
-               CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));\r
-\r
-               int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;\r
-               for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)\r
-               {\r
-                       *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " << \r
-                               rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";\r
-                       for (int j = 0;j < rowBlockPos[i].x;j++)\r
-                       {\r
-                               //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!\r
-                               *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";\r
-#ifdef HLTCA_GPU_SCHED_FIXED_START\r
-                               if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)\r
-                               {\r
-                                       HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);\r
-                               }\r
-#endif //HLTCA_GPU_SCHED_FIXED_START\r
-                               k++;\r
-                               if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)\r
-                               {\r
-                                       HLTError("Error, -1 Tracklet found");\r
-                               }\r
-                       }\r
-                       *fOutFile << std::endl << "Phase 2: ";\r
-                       for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)\r
-                       {\r
-                               *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";\r
-                       }\r
-                       *fOutFile << std::endl;\r
-               }\r
-\r
-               if (check)\r
-               {\r
-                       *fOutFile << "Starting Threads: (First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;\r
-                       for (int i = 0;i < HLTCA_GPU_BLOCK_COUNT;i++)\r
-                       {\r
-                               *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;\r
-                       }\r
-               }\r
-\r
-               free(rowBlockPos);\r
-               free(rowBlockTracklets);\r
-               free(blockStartingTracklet);\r
-       }\r
-}\r
-\r
-__global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)\r
-{\r
-       //Initialize GPU RowBlocks and HitWeights\r
-       int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;\r
-       int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;\r
-       const int stride = blockDim.x * gridDim.x;\r
-       int4 i0, i1;\r
-       i0.x = i0.y = i0.z = i0.w = 0;\r
-       i1.x = i1.y = i1.z = i1.w = -1;\r
-       for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)\r
-               RowBlockPos[i] = i0;\r
-       for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)\r
-               rowBlockTracklets4[i] = i1;\r
-       for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)\r
-               sliceDataHitWeights4[i] = i0;\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)\r
-{\r
-       //Primary reconstruction function\r
-\r
-       cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;\r
-\r
-       if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;\r
-       \r
-       if (!fCudaInitialized)\r
-       {\r
-           HLTError("GPUTracker not initialized");\r
-           return(1);\r
-       }\r
-       if (sliceCountLocal > fSliceCount)\r
-       {\r
-           HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);\r
-           return(1);\r
-       }\r
-       if (fThreadId != GetThread())\r
-       {\r
-           HLTError("GPUTracker context was initialized by different thread, Initializing Thread: %d, Processing Thread: %d", fThreadId, GetThread());\r
-           return(1);\r
-       }\r
-\r
-       if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);\r
-\r
-       if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)\r
-       {\r
-               HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));\r
-               return(1);\r
-       }\r
-\r
-       if (fPPMode) return(ReconstructPP(pOutput, pClusterData, firstSlice, sliceCountLocal));\r
-\r
-       memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);\r
-\r
-       if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");\r
-\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
-       unsigned long long int a, b, c, d;\r
-       AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);\r
-       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);\r
-#endif\r
-       \r
-       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
-       {\r
-               //Make this a GPU Tracker\r
-               fGpuTracker[iSlice].SetGPUTracker();\r
-               fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));\r
-               fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));\r
-               fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);\r
-\r
-               //Set Pointers to GPU Memory\r
-               char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);\r
-\r
-               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");\r
-               tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());\r
-               tmpMem = alignPointer(tmpMem, 1024 * 1024);\r
-\r
-               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");\r
-               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);\r
-               tmpMem = alignPointer(tmpMem, 1024 * 1024);\r
-\r
-               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");\r
-               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());\r
-               tmpMem = alignPointer(tmpMem, 1024 * 1024);\r
-\r
-               if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)\r
-               {\r
-                       HLTError("Insufficiant Track Memory");\r
-                       return(1);\r
-               }\r
-\r
-               if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)\r
-               {\r
-                       HLTError("Insufficiant Global Memory");\r
-                       return(1);\r
-               }\r
-\r
-               if (fDebugLevel >= 3)\r
-               {\r
-                       HLTInfo("GPU Global Memory Used: %d/%d, Page Locked Tracks Memory used: %d / %d", tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice), HLTCA_GPU_GLOBAL_MEMORY, fGpuTracker[iSlice].TrackMemorySize(), HLTCA_GPU_TRACKS_MEMORY);\r
-               }\r
-\r
-               //Initialize Startup Constants\r
-               *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;\r
-               *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;\r
-               *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;\r
-               fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = HLTCA_GPU_BLOCK_COUNT * (iSlice + 1) / sliceCountLocal - HLTCA_GPU_BLOCK_COUNT * (iSlice) / sliceCountLocal;\r
-               if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);\r
-               fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;\r
-               fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;\r
-               fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;\r
-               fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());\r
-       }\r
-\r
-#ifdef HLTCA_GPU_TEXTURE_FETCH\r
-               cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();\r
-               size_t offset;\r
-               if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
-               {\r
-                       HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);\r
-                       return(1);\r
-               }\r
-               cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();\r
-               if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
-               {\r
-                       HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);\r
-                       return(1);\r
-               }\r
-               cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();\r
-               if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
-               {\r
-                       HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);\r
-                       return(1);\r
-               }\r
-#endif\r
-\r
-       //Copy Tracker Object to GPU Memory\r
-       if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");\r
-#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE\r
-       char* tmpMem;\r
-       if (CudaFailedMsg(cudaMalloc(&tmpMem, 100000000))) return(1);\r
-       fGpuTracker[0].fStageAtSync = tmpMem;\r
-       CudaFailedMsg(cudaMemset(fGpuTracker[0].StageAtSync(), 0, 100000000));\r
-#endif\r
-       CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));\r
-\r
-       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
-       {\r
-               StandalonePerfTime(firstSlice + iSlice, 0);\r
-\r
-               //Initialize GPU Slave Tracker\r
-               if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");\r
-               fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
-                       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);\r
-#endif\r
-               fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);\r
-\r
-                 if (fDebugLevel >= 4)\r
-                 {\r
-                         *fOutFile << std::endl << std::endl << "Reconstruction: " << iSlice << "/" << sliceCountLocal << " Total Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << " / " << fgkNSlices << std::endl;\r
-\r
-                         if (fDebugMask & 1) fSlaveTrackers[firstSlice + iSlice].DumpSliceData(*fOutFile);\r
-                 }\r
-\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
-                       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);\r
-               printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);\r
-#endif\r
-               if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)\r
-               {\r
-                       HLTError("Insufficiant Slice Data Memory");\r
-                       return(1);\r
-               }\r
-\r
-               if (fDebugLevel >= 3)\r
-               {\r
-                       HLTInfo("GPU Slice Data Memory Used: %d/%d", fSlaveTrackers[firstSlice + iSlice].Data().MemorySize(), HLTCA_GPU_SLICE_DATA_MEMORY);\r
-               }\r
-\r
-               //Initialize temporary memory where needed\r
-               if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");           \r
-               PreInitRowBlocks<<<30, 256, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());\r
-\r
-               //Copy Data to GPU Global Memory\r
-               CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));\r
-               CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));\r
-               CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));\r
-\r
-               if (fDebugLevel >= 4)\r
-               {\r
-                       if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");\r
-                       fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS);\r
-                       fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() );\r
-               }\r
-               \r
-               if (CUDASync("Initialization", iSlice, iSlice + firstSlice)) return(1);\r
-               StandalonePerfTime(firstSlice + iSlice, 1);\r
-\r
-               if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder (Slice %d/%d)", iSlice, sliceCountLocal);\r
-               AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);\r
-\r
-               if (CUDASync("Neighbours finder", iSlice, iSlice + firstSlice)) return 1;\r
-\r
-               StandalonePerfTime(firstSlice + iSlice, 2);\r
-\r
-               if (fDebugLevel >= 4)\r
-               {\r
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));\r
-                       if (fDebugMask & 2) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);\r
-               }\r
-\r
-               if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner (Slice %d/%d)", iSlice, sliceCountLocal);\r
-               AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);\r
-               if (CUDASync("Neighbours Cleaner", iSlice, iSlice + firstSlice)) return 1;\r
-\r
-               StandalonePerfTime(firstSlice + iSlice, 3);\r
-\r
-               if (fDebugLevel >= 4)\r
-               {\r
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));\r
-                       if (fDebugMask & 4) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);\r
-               }\r
-\r
-               if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder (Slice %d/%d)", iSlice, sliceCountLocal);\r
-               AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);\r
-               if (CUDASync("Start Hits Finder", iSlice, iSlice + firstSlice)) return 1;\r
-\r
-               StandalonePerfTime(firstSlice + iSlice, 4);\r
-\r
-               if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter (Slice %d/%d)", iSlice, sliceCountLocal);\r
-               AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<30, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);\r
-               if (CUDASync("Start Hits Sorter", iSlice, iSlice + firstSlice)) return 1;\r
-\r
-               StandalonePerfTime(firstSlice + iSlice, 5);\r
-\r
-               if (fDebugLevel >= 2)\r
-               {\r
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));\r
-                       if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d (Slice %d)", *fSlaveTrackers[firstSlice + iSlice].NTracklets(), iSlice);\r
-                       if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS)\r
-                       {\r
-                               HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant");\r
-                               return(1);\r
-                       }\r
-               }\r
-\r
-               if (fDebugLevel >= 4)\r
-               {\r
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));\r
-                       if (fDebugMask & 8)\r
-                       {\r
-                               *fOutFile << "Temporary ";\r
-                               fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);\r
-                       }\r
-                       uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());\r
-                       CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));\r
-                       if (fDebugMask & 16)\r
-                       {\r
-                               *fOutFile << "Start Hits Sort Vector:" << std::endl;\r
-                               for (int i = 0;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows();i++)\r
-                               {\r
-                                       *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;\r
-                               }\r
-                       }\r
-                       free(tmpMemory);\r
-\r
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));\r
-                       if (fDebugMask & 32) fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);\r
-               }\r
-\r
-               StandalonePerfTime(firstSlice + iSlice, 6);\r
-               \r
-               fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());\r
-       }\r
-\r
-       StandalonePerfTime(firstSlice, 7);\r
-#ifdef HLTCA_GPU_PREFETCHDATA\r
-       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
-       {\r
-               if (fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v) > ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4))\r
-               {\r
-                       HLTError("Insufficiant GPU shared Memory, required: %d, available %d", fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v), ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4));\r
-                       return(1);\r
-               }\r
-               if (fDebugLevel >= 1)\r
-               {\r
-                       static int infoShown = 0;\r
-                       if (!infoShown)\r
-                       {\r
-                               HLTInfo("GPU Shared Memory Cache Size: %d", 2 * fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v));\r
-                               infoShown = 1;\r
-                       }\r
-               }\r
-       }\r
-#endif\r
-\r
-       int nHardCollisions = 0;\r
-\r
-RestartTrackletConstructor:\r
-       if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");\r
-       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
-       {\r
-               AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);\r
-               if (CUDASync("Tracklet Initializer", iSlice, iSlice + firstSlice)) return 1;\r
-               if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice);\r
-       }\r
-\r
-       if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");\r
-       AliHLTTPCCATrackletConstructorGPU<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>();\r
-       if (CUDASync("Tracklet Constructor", 0, firstSlice)) return 1;\r
-       \r
-       StandalonePerfTime(firstSlice, 8);\r
-\r
-       if (fDebugLevel >= 4)\r
-       {\r
-               for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
-               {\r
-                       if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);\r
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));\r
-                       if (fDebugLevel >= 5)\r
-                       {\r
-                               HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());\r
-                       }\r
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));\r
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));\r
-                       if (fDebugMask & 128) fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);\r
-               }\r
-       }\r
-\r
-       int runSlices = 0;\r
-       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)\r
-       {\r
-               if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;\r
-               if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);\r
-               AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));\r
-               if (CUDASync("Tracklet Selector", iSlice, iSlice + firstSlice)) return 1;\r
-       }\r
-       StandalonePerfTime(firstSlice, 9);\r
-\r
-       int tmpSlice = 0, tmpSlice2 = 0;\r
-       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
-       {\r
-               if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");\r
-\r
-               while(tmpSlice < sliceCountLocal && (tmpSlice == iSlice || cudaStreamQuery(cudaStreams[tmpSlice]) == CUDA_SUCCESS))\r
-               {\r
-                       if (CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice]))) return(1);\r
-                       tmpSlice++;\r
-               }\r
-\r
-               while (tmpSlice2 < tmpSlice && (tmpSlice2 == iSlice ? cudaStreamSynchronize(cudaStreams[tmpSlice2]) : cudaStreamQuery(cudaStreams[tmpSlice2])) == CUDA_SUCCESS)\r
-               {\r
-                       CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].Tracks(), fGpuTracker[tmpSlice2].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + tmpSlice2].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));\r
-                       CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].TrackHits(), fGpuTracker[tmpSlice2].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + tmpSlice2].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));\r
-                       tmpSlice2++;\r
-               }\r
-\r
-               cudaStreamSynchronize(cudaStreams[iSlice]);\r
-\r
-               if (fDebugLevel >= 4)\r
-               {\r
-                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));\r
-                       if (fDebugMask & 256) fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);\r
-                       if (fDebugMask & 512) fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);\r
-               }\r
-\r
-               if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)\r
-               {\r
-                       if ((fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION || fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_WRONG_ROW)&& nHardCollisions++ < 10)\r
-                       {\r
-                               if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION)\r
-                               {\r
-                                       HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor");\r
-                               }\r
-                               else\r
-                               {\r
-                                       HLTWarning("Tracklet Constructor returned invalid row");\r
-                               }\r
-                               for (int i = 0;i < sliceCountLocal;i++)\r
-                               {\r
-                                       cudaThreadSynchronize();\r
-                                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));\r
-                                       *fSlaveTrackers[firstSlice + i].NTracks() = 0;\r
-                                       *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;\r
-                                       fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;\r
-                                       CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));\r
-                                       PreInitRowBlocks<<<30, 256>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());\r
-                               }\r
-                               goto RestartTrackletConstructor;\r
-                       }\r
-                       HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);\r
-                       return(1);\r
-               }\r
-               if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());\r
-\r
-               fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
-               AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);\r
-#endif\r
-               fSlaveTrackers[firstSlice + iSlice].WriteOutput();\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
-               AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);\r
-               printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);\r
-#endif\r
-\r
-               if (fDebugLevel >= 4)\r
-               {\r
-                       delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();\r
-                       delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();\r
-               }\r
-       }\r
-\r
-       StandalonePerfTime(firstSlice, 10);\r
-\r
-       if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");\r
-\r
-#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE\r
-       char* stageAtSync = (char*) malloc(100000000);\r
-       CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].StageAtSync(), 100 * 1000 * 1000, cudaMemcpyDeviceToHost));\r
-       cudaFree(fGpuTracker[0].StageAtSync());\r
-\r
-       FILE* fp = fopen("profile.txt", "w+");\r
-       FILE* fp2 = fopen("profile.bmp", "w+b");\r
-       int nEmptySync = 0, fEmpty;\r
-\r
-       const int bmpheight = 1000;\r
-       BITMAPFILEHEADER bmpFH;\r
-       BITMAPINFOHEADER bmpIH;\r
-       ZeroMemory(&bmpFH, sizeof(bmpFH));\r
-       ZeroMemory(&bmpIH, sizeof(bmpIH));\r
-       \r
-       bmpFH.bfType = 19778; //"BM"\r
-       bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1) * bmpheight ;\r
-       bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);\r
-\r
-       bmpIH.biSize = sizeof(bmpIH);\r
-       bmpIH.biWidth = HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1;\r
-       bmpIH.biHeight = bmpheight;\r
-       bmpIH.biPlanes = 1;\r
-       bmpIH.biBitCount = 32;\r
-\r
-       fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);\r
-       fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);  \r
-\r
-       for (int i = 0;i < bmpheight * HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;i += HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT)\r
-       {\r
-               fEmpty = 1;\r
-               for (int j = 0;j < HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;j++)\r
-               {\r
-                       fprintf(fp, "%d\t", stageAtSync[i + j]);\r
-                       int color = 0;\r
-                       if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);\r
-                       if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);\r
-                       if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);\r
-                       if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);\r
-                       fwrite(&color, 1, sizeof(int), fp2);\r
-                       if (j > 0 && j % 32 == 0)\r
-                       {\r
-                               color = RGB(255, 255, 255);\r
-                               fwrite(&color, 1, 4, fp2);\r
-                       }\r
-                       if (stageAtSync[i + j]) fEmpty = 0;\r
-               }\r
-               fprintf(fp, "\n");\r
-               if (fEmpty) nEmptySync++;\r
-               else nEmptySync = 0;\r
-               //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;\r
-       }\r
-\r
-       fclose(fp);\r
-       fclose(fp2);\r
-       free(stageAtSync);\r
-#endif \r
-\r
-       return(0);\r
-}\r
-\r
-__global__ void ClearPPHitWeights(int sliceCount)\r
-{\r
-       //Clear HitWeights\r
-       \r
-       for (int k = 0;k < sliceCount;k++)\r
-       {\r
-               AliHLTTPCCATracker &tracker = ((AliHLTTPCCATracker*) gAliHLTTPCCATracker)[k];\r
-               int4* const pHitWeights = (int4*) tracker.Data().HitWeights();\r
-               const int dwCount = tracker.Data().NumberOfHitsPlusAlign();\r
-               const int stride = blockDim.x * gridDim.x;\r
-               int4 i0;\r
-               i0.x = i0.y = i0.z = i0.w = 0;\r
-       \r
-               for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < dwCount * sizeof(int) / sizeof(int4);i += stride)\r
-               {\r
-                       pHitWeights[i] = i0;\r
-               }\r
-       }\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)\r
-{\r
-       //Primary reconstruction function for small events (PP)\r
-\r
-       memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);\r
-\r
-       if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");\r
-\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
-       unsigned long long int a, b, c, d;\r
-       AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);\r
-       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);\r
-#endif\r
-\r
-       char* tmpSliceMemHost = (char*) SliceDataMemory(fHostLockedMemory, 0);\r
-       char* tmpSliceMemGpu = (char*) SliceDataMemory(fGPUMemory, 0);\r
-\r
-       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
-       {\r
-               StandalonePerfTime(firstSlice + iSlice, 0);\r
-\r
-               //Initialize GPU Slave Tracker\r
-               if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");\r
-               fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(tmpSliceMemHost, RowMemory(fHostLockedMemory, firstSlice + iSlice));\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
-                       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);\r
-#endif\r
-               fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
-                       AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);\r
-               printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);\r
-#endif\r
-               if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)\r
-               {\r
-                       HLTError("Insufficiant Slice Data Memory");\r
-                       return(1);\r
-               }\r
-\r
-               //Make this a GPU Tracker\r
-               fGpuTracker[iSlice].SetGPUTracker();\r
-               fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));\r
-\r
-\r
-               fGpuTracker[iSlice].SetGPUSliceDataMemory(tmpSliceMemGpu, RowMemory(fGPUMemory, iSlice));\r
-               fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);\r
-\r
-               tmpSliceMemHost += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();\r
-               tmpSliceMemHost = alignPointer(tmpSliceMemHost, 64 * 1024);\r
-               tmpSliceMemGpu += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();\r
-               tmpSliceMemGpu = alignPointer(tmpSliceMemGpu, 64 * 1024);\r
-\r
-\r
-               //Set Pointers to GPU Memory\r
-               char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);\r
-\r
-               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");\r
-               tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());\r
-               tmpMem = alignPointer(tmpMem, 64 * 1024);\r
-\r
-               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");\r
-               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);\r
-               tmpMem = alignPointer(tmpMem, 64 * 1024);\r
-\r
-               if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");\r
-               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());\r
-               tmpMem = alignPointer(tmpMem, 64 * 1024);\r
-\r
-               if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)\r
-               {\r
-                       HLTError("Insufficiant Track Memory");\r
-                       return(1);\r
-               }\r
-\r
-               if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)\r
-               {\r
-                       HLTError("Insufficiant Global Memory");\r
-                       return(1);\r
-               }\r
-\r
-               //Initialize Startup Constants\r
-               *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;\r
-               *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;\r
-               *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;\r
-               fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;\r
-\r
-               fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());\r
-\r
-               if (CUDASync("Initialization", iSlice, iSlice + firstSlice)) return(1);\r
-               StandalonePerfTime(firstSlice + iSlice, 1);\r
-       }\r
-\r
-#ifdef HLTCA_GPU_TEXTURE_FETCH\r
-               cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();\r
-               size_t offset;\r
-               if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
-               {\r
-                       HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);\r
-                       return(1);\r
-               }\r
-               cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();\r
-               if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
-               {\r
-                       HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);\r
-                       return(1);\r
-               }\r
-               cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();\r
-               if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
-               {\r
-                       HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);\r
-                       return(1);\r
-               }\r
-#endif\r
-\r
-       //Copy Tracker Object to GPU Memory\r
-       if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");\r
-       CudaFailedMsg(cudaMemcpyToSymbol(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice));\r
-\r
-       //Copy Data to GPU Global Memory\r
-       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
-       {\r
-               CudaFailedMsg(cudaMemcpy(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice));\r
-               //printf("%lld %lld %d %d\n", (size_t) (char*) fGpuTracker[iSlice].Data().Memory(), (size_t) (char*) fSlaveTrackers[firstSlice + iSlice].Data().Memory(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize());\r
-       }\r
-       //CudaFailedMsg(cudaMemcpy(SliceDataMemory(fGPUMemory, 0), SliceDataMemory(fHostLockedMemory, 0), tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0), cudaMemcpyHostToDevice));\r
-       //printf("%lld %lld %d\n", (size_t) (char*) SliceDataMemory(fGPUMemory, 0), (size_t) (char*) SliceDataMemory(fHostLockedMemory, 0), (int) (size_t) (tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0)));\r
-       CudaFailedMsg(cudaMemcpy(fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyHostToDevice));\r
-       CudaFailedMsg(cudaMemcpy(fGpuTracker[0].SliceDataRows(), fSlaveTrackers[firstSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) * sliceCountLocal, cudaMemcpyHostToDevice));\r
-\r
-       if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");\r
-       AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursFinder> <<<30, 256>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows());\r
-       if (CUDASync("Neighbours finder", 0, firstSlice)) return 1;\r
-       StandalonePerfTime(firstSlice, 2);\r
-       if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");\r
-       AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursCleaner> <<<30, 256>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 2);\r
-       if (CUDASync("Neighbours Cleaner", 0, firstSlice)) return 1;\r
-       StandalonePerfTime(firstSlice, 3);\r
-       if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");\r
-       AliHLTTPCCAProcessMultiA<AliHLTTPCCAStartHitsFinder> <<<30, 256>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 6);\r
-       if (CUDASync("Start Hits Finder", 0, firstSlice)) return 1;\r
-       StandalonePerfTime(firstSlice, 4);\r
-\r
-       ClearPPHitWeights <<<30, 256>>>(sliceCountLocal);\r
-       if (CUDASync("Clear Hit Weights", 0, firstSlice)) return 1;\r
-\r
-       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
-       {\r
-               fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());\r
-       }\r
-\r
-       StandalonePerfTime(firstSlice, 7);\r
-\r
-       if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");\r
-       AliHLTTPCCATrackletConstructorGPUPP<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal);\r
-       if (CUDASync("Tracklet Constructor PP", 0, firstSlice)) return 1;\r
-       \r
-       StandalonePerfTime(firstSlice, 8);\r
-\r
-       AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal);\r
-       if (CUDASync("Tracklet Selector", 0, firstSlice)) return 1;\r
-       StandalonePerfTime(firstSlice, 9);\r
-\r
-       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice].CommonMemory(), fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyDeviceToHost));\r
-\r
-       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
-       {\r
-               if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");\r
-\r
-               CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost));\r
-               CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost));\r
-\r
-               if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)\r
-               {\r
-                       HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);\r
-                       return(1);\r
-               }\r
-               if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());\r
-\r
-               fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
-               AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);\r
-#endif\r
-               fSlaveTrackers[firstSlice + iSlice].WriteOutput();\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
-               AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);\r
-               printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);\r
-#endif\r
-       }\r
-\r
-       StandalonePerfTime(firstSlice, 10);\r
-\r
-       if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");\r
-\r
-       return(0);\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)\r
-{\r
-       //Initialize Slice Tracker Parameter for a slave tracker\r
-       fSlaveTrackers[iSlice].Initialize(param);\r
-       if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)\r
-       {\r
-               HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);\r
-               return(1);\r
-       }\r
-       return(0);\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::ExitGPU()\r
-{\r
-       //Uninitialize CUDA\r
-       cudaThreadSynchronize();\r
-       if (fGPUMemory)\r
-       {\r
-               cudaFree(fGPUMemory);\r
-               fGPUMemory = NULL;\r
-       }\r
-       if (fHostLockedMemory)\r
-       {\r
-               for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)\r
-               {\r
-                       cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);\r
-               }\r
-               free(fpCudaStreams);\r
-               fGpuTracker = NULL;\r
-               cudaFreeHost(fHostLockedMemory);\r
-       }\r
-\r
-       if (CudaFailedMsg(cudaThreadExit()))\r
-       {\r
-               HLTError("Could not uninitialize GPU");\r
-               return(1);\r
-       }\r
-       HLTInfo("CUDA Uninitialized");\r
-       fgGPUUsed = false;\r
-       fCudaInitialized = 0;\r
-       return(0);\r
-}\r
-\r
-void AliHLTTPCCAGPUTrackerNVCC::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)\r
-{\r
-       //Set Output Control Pointers\r
-       fOutputControl = val;\r
-       for (int i = 0;i < fgkNSlices;i++)\r
-       {\r
-               fSlaveTrackers[i].SetOutputControl(val);\r
-       }\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::GetThread()\r
-{\r
-       //Get Thread ID\r
-#ifdef R__WIN32\r
-       return((int) (size_t) GetCurrentThread());\r
-#else\r
-       return((int) syscall (SYS_gettid));\r
-#endif\r
-}\r
-\r
-unsigned long long int* AliHLTTPCCAGPUTrackerNVCC::PerfTimer(int iSlice, unsigned int i)\r
-{\r
-       //Returns pointer to PerfTimer i of slice iSlice\r
-       return(fSlaveTrackers ? fSlaveTrackers[iSlice].PerfTimer(i) : NULL);\r
-}\r
-\r
-const AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerNVCC::OutputControl() const\r
-{\r
-       //Return Pointer to Output Control Structure\r
-       return fOutputControl;\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::GetSliceCount() const\r
-{\r
-       //Return max slice count processable\r
-       return(fSliceCount);\r
-}\r
-\r
-AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()\r
-{\r
-       return new AliHLTTPCCAGPUTrackerNVCC;\r
-}\r
-\r
-void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr)\r
-{\r
-       delete ptr;\r
-}\r
-\r
diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cu.patch b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cu.patch
deleted file mode 100644 (file)
index de94951..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
---- AliHLTTPCCAGPUTracker.cucpp        2009-05-28 12:14:09.000000000 +0200
-+++ release/x86_64-pc-linux-gnu/code/AliHLTTPCCAGPUTracker.cucpp       2009-05-28 12:10:25.000000000 +0200
-@@ -23186,7 +23186,7 @@
- static T2 *Alloc(int s) { auto T2 *p = (reinterpret_cast< T2 *>(_mm_malloc(s * sizeof(CacheLineSizeHelper< T> ), 128))); return new (p) T2 [s]; }
- static void Free(T2 *const p, int size) {
- for (int i = 0; i < size; ++i) {
--((p[i]).~CacheLineSizeHelper());
-+((p[i]).~T2());
- }
- _mm_free(p);
- }
diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cxx
deleted file mode 100644 (file)
index 618c9dc..0000000
+++ /dev/null
@@ -1,168 +0,0 @@
-// $Id$
-// **************************************************************************
-// This file is property of and copyright by the ALICE HLT Project          *
-// ALICE Experiment at CERN, All rights reserved.                           *
-//                                                                          *
-// Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
-//                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *
-//                                     David Rohr <drohr@kip.uni-heidelberg.de>                                *
-//                  for The ALICE HLT Project.                              *
-//                                                                          *
-// Permission to use, copy, modify and distribute this software and its     *
-// documentation strictly for non-commercial purposes is hereby granted     *
-// without fee, provided that the above copyright notice appears in all     *
-// copies and that both the copyright notice and this permission notice     *
-// appear in the supporting documentation. The authors make no claims       *
-// about the suitability of this software for any purpose. It is            *
-// provided "as is" without express or implied warranty.                    *
-//                                                                          *
-//***************************************************************************
-
-//  @file   AliHLTTPCCAGPUTrackerNVCC.cxx
-//  @author David Rohr, Sergey Gorbunov
-//  @date   
-//  @brief  TPC CA Tracker for the NVIDIA GPU
-//  @note 
-
-#include "AliHLTTPCCAGPUTrackerNVCC.h"
-
-AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() :
-       fGpuTracker(NULL),
-       fGPUMemory(NULL),
-       fHostLockedMemory(NULL),
-       fDebugLevel(0),
-       fOutFile(NULL),
-       fGPUMemSize(0),
-       fpCudaStreams(NULL),
-       fSliceCount(0),
-       fOutputControl(NULL),
-       fThreadId(0),
-       fCudaInitialized(0)
-       
-{
-  // see header file for class documentation
-  // or
-  // refer to README to build package
-  // or
-  // visit http://web.ift.uib.no/~kjeks/doc/alice-hlt
-};
-
-AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC() 
-{
-  // see header file for class documentation
-};
-
-void AliHLTTPCCAGPUTrackerNVCC::ReleaseGlobalLock(void* sem)
-{
-  // see header file for class documentation
-}
-
-int AliHLTTPCCAGPUTrackerNVCC::CheckMemorySizes(int sliceCount)
-{
-  // see header file for class documentation
-  return(0);
-}
-
-int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)
-{
-  // see header file for class documentation
-  return(0);
-}
-
-template <class T> inline T* AliHLTTPCCAGPUTrackerNVCC::alignPointer(T* ptr, int alignment)
-{
-  // see header file for class documentation
-       return((T*) NULL);
-}
-
-bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsg(cudaError_t error)
-{
-  // see header file for class documentation
-       return(true);
-}
-
-int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state)
-{
-  // see header file for class documentation
-       return(0);
-}
-
-void AliHLTTPCCAGPUTrackerNVCC::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
-{
-  // see header file for class documentation
-}
-
-int AliHLTTPCCAGPUTrackerNVCC::SetGPUTrackerOption(char* OptionName, int /*OptionValue*/)
-{
-       return(0);
-}
-
-void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int /*iSlice*/, int /*i*/) 
-{
-  // see header file for class documentation
-}
-
-void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
-{
-  // see header file for class documentation
-}
-
-int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
-{
-  // see header file for class documentation
-       return(0);
-}
-
-int AliHLTTPCCAGPUTrackerNVCC::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
-{
-  // see header file for class documentation
-       return(0);
-}
-
-int AliHLTTPCCAGPUTrackerNVCC::ExitGPU()
-{
-  // see header file for class documentation
-       return(0);
-}
-
-void AliHLTTPCCAGPUTrackerNVCC::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
-{
-  // see header file for class documentation
-}
-
-int AliHLTTPCCAGPUTrackerNVCC::GetThread()
-{
-  // see header file for class documentation
-    return(0);
-}
-
-unsigned long long int* AliHLTTPCCAGPUTrackerNVCC::PerfTimer(int iSlice, unsigned int i)
-{
-  // see header file for class documentation
-    static unsigned long long int tmp;
-    return(&tmp);
-}
-
-const AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerNVCC::OutputControl() const
-{
-       //Return Pointer to Output Control Structure
-       return fOutputControl;
-}
-
-int AliHLTTPCCAGPUTrackerNVCC::GetSliceCount() const
-{
-       //Return max slice count processable
-       return(fSliceCount);
-}
-
-AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()
-{
-  // see header file for class documentation
-       return new AliHLTTPCCAGPUTrackerNVCC;
-} 
-void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr)
-{
-  // see header file for class documentation
-       delete ptr;
-}
-
diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.h
deleted file mode 100644 (file)
index 01d32e1..0000000
+++ /dev/null
@@ -1,110 +0,0 @@
-//-*- Mode: C++ -*-
-// $Id$
-
-// ************************************************************************
-// This file is property of and copyright by the ALICE HLT Project        *
-// ALICE Experiment at CERN, All rights reserved.                         *
-// See cxx source for full Copyright notice                               *
-//                                                                        *
-//*************************************************************************
-
-//  @file   AliHLTTPCCAGPUTrackerNVCC.h
-//  @author David Rohr, Sergey Gorbunov
-//  @date   
-//  @brief  TPC CA Tracker for the NVIDIA GPU
-//  @note 
-
-
-#ifndef ALIHLTTPCCAGPUTRACKERNVCC_H
-#define ALIHLTTPCCAGPUTRACKERNVCC_H
-
-#include "AliHLTTPCCAGPUTracker.h"
-#include "AliHLTTPCCADef.h"
-#include "AliHLTTPCCATracker.h"
-#include "AliHLTLogging.h"
-#include "AliHLTTPCCASliceOutput.h"
-
-class AliHLTTPCCARow;
-
-class AliHLTTPCCAGPUTrackerNVCC : public AliHLTTPCCAGPUTracker, public AliHLTLogging
-{
-public:
-       AliHLTTPCCAGPUTrackerNVCC();
-       virtual ~AliHLTTPCCAGPUTrackerNVCC();
-
-       virtual int InitGPU(int sliceCount = 12, int forceDeviceID = -1);
-       virtual int Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1);
-       virtual int ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1);
-       virtual int ExitGPU();
-
-       virtual void SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile = NULL);
-       virtual int SetGPUTrackerOption(char* OptionName, int OptionValue);
-
-       virtual unsigned long long int* PerfTimer(int iSlice, unsigned int i);
-
-       virtual int InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param);
-       virtual void SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val);
-
-       virtual const AliHLTTPCCASliceOutput::outputControlStruct* OutputControl() const;
-       virtual int GetSliceCount() const;
-
-private:
-       static void* RowMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + iSlice * sizeof(AliHLTTPCCARow) * (HLTCA_ROW_COUNT + 1) ); }
-       static void* CommonMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + iSlice * AliHLTTPCCATracker::CommonMemorySize() ); }
-       static void* SliceDataMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + iSlice * HLTCA_GPU_SLICE_DATA_MEMORY ); }
-       void* GlobalMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY) + iSlice * HLTCA_GPU_GLOBAL_MEMORY ); }
-       void* TracksMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY) + iSlice * HLTCA_GPU_TRACKS_MEMORY ); }
-       void* TrackerMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + iSlice * sizeof(AliHLTTPCCATracker) ); }
-
-       void DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check = true);
-       int GetThread();
-       void ReleaseGlobalLock(void* sem);
-       int CheckMemorySizes(int sliceCount);
-
-       int CUDASync(char* state = "UNKNOWN", int sliceLocal = 0, int slice = 0);
-       template <class T> T* alignPointer(T* ptr, int alignment);
-       void StandalonePerfTime(int iSlice, int i);
-#ifdef HLTCA_GPUCODE
-       bool CudaFailedMsg(cudaError_t error);
-#endif //HLTCA_GPUCODE
-
-       AliHLTTPCCATracker *fGpuTracker; //Tracker Objects that will be used on the GPU
-       void* fGPUMemory; //Pointer to GPU Memory Base Adress
-       void* fHostLockedMemory; //Pointer to Base Adress of Page Locked Host Memory for DMA Transfer
-
-       int fDebugLevel;                        //Debug Level for GPU Tracker
-       unsigned int fDebugMask;        //Mask which Debug Data is written to file
-       std::ostream* fOutFile;         //Debug Output Stream Pointer
-       unsigned long long int fGPUMemSize;     //Memory Size to allocate on GPU
-
-       void* fpCudaStreams; //Pointer to array of CUDA Streams
-       int fSliceCount; //Maximum Number of Slices this GPU tracker can process in parallel
-
-       static const int fgkNSlices = 36; //Number of Slices in Alice
-       AliHLTTPCCATracker fSlaveTrackers[fgkNSlices]; //CPU Slave Trackers for Initialization and Output
-
-       AliHLTTPCCASliceOutput::outputControlStruct* fOutputControl; //Output Control Structure
-       
-       static bool fgGPUUsed; //Flag signaling that a GPU tracker is initialized in a process
-       int fThreadId; //Thread ID that is valid for the local CUDA context
-       int fCudaInitialized; //Flag if CUDA is initialized
-
-       int fPPMode; //Flag if GPU tracker runs in PP Mode
-
-       // disable copy
-       AliHLTTPCCAGPUTrackerNVCC( const AliHLTTPCCAGPUTrackerNVCC& );
-       AliHLTTPCCAGPUTrackerNVCC &operator=( const AliHLTTPCCAGPUTrackerNVCC& );
-
-       ClassDef( AliHLTTPCCAGPUTrackerNVCC, 0 )
-};
-
-#ifdef R__WIN32
-#define DLL_EXPORT __declspec(dllexport)
-#else
-#define DLL_EXPORT
-#endif
-
-extern "C" DLL_EXPORT AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate();
-extern "C" DLL_EXPORT void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr);
-
-#endif //ALIHLTTPCCAGPUTRACKER_H
index c49b154b7919ff4a5e731183af1862903826c2e5..eb67d956b8d84767ed2b73d17a6daf86b4d37be9 100644 (file)
@@ -20,9 +20,9 @@
 #include "AliHLTArray.h"
 #include "AliHLTTPCCAHit.h"
 #include "AliHLTTPCCAParam.h"
-#include "MemoryAssignmentHelpers.h"
 #include "AliHLTTPCCAGPUConfig.h"
 #include "AliHLTTPCCAGPUTracker.h"
+#include "MemoryAssignmentHelpers.h"
 #include <iostream>
 
 // calculates an approximation for 1/sqrt(x)
index d822d90d664359752b8de221460b0cb7e906a419..2fcbe84acfc3e21ca918b84bea4ac08d9361fe5f 100644 (file)
@@ -18,8 +18,6 @@
 //***************************************************************************
 
 #include "AliHLTTPCCASliceOutput.h"
-#include "MemoryAssignmentHelpers.h"
-
 
 int AliHLTTPCCASliceOutput::EstimateSize( int nOfTracks, int nOfTrackClusters )
 {
index c8a3498df7a3aa6a1561ebefa5eb52fd693259cb..e51b9ccac4345a407b4590c76d049f7ca9a133c6 100644 (file)
@@ -43,13 +43,13 @@ GPUd() void AliHLTTPCCAStartHitsSorter::Thread
                if (iBlock == nBlocks - 1 && nCurrentBlock < gpuFixedBlockCount)
                {
                        startOffset += tracker.RowStartHitCountOffset()[ir].x;
-                       for (int i = previousBlockEndTracklet + HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS;i <= startOffset;i += HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS)
+                       for (int i = previousBlockEndTracklet + HLTCA_GPU_THREAD_COUNT;i <= startOffset;i += HLTCA_GPU_THREAD_COUNT)
                        {
-                               if (previousBlockEndTracklet / (HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS) != i / (HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS))
+                               if (previousBlockEndTracklet / HLTCA_GPU_THREAD_COUNT != i / HLTCA_GPU_THREAD_COUNT)
                                {
                                        tracker.BlockStartingTracklet()[nCurrentBlock].x = previousBlockEndTracklet;
-                                       tracker.BlockStartingTracklet()[nCurrentBlock++].y = HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS;
-                                       previousBlockEndTracklet += HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS;
+                                       tracker.BlockStartingTracklet()[nCurrentBlock++].y = HLTCA_GPU_THREAD_COUNT;
+                                       previousBlockEndTracklet += HLTCA_GPU_THREAD_COUNT;
                                        if (nCurrentBlock == gpuFixedBlockCount)
                                        {
                                                break;
index 63e7737d9cdba1c9a336f54120be0428170854d8..e24d64457d5c18b681d18d88e00139eb46f28440 100644 (file)
@@ -310,28 +310,23 @@ int AliHLTTPCCATrackerComponent::Configure( const char* cdbEntry, const char* ch
   return iResult1 ? iResult1 : ( iResult2 ? iResult2 : ( iResult3 ? iResult3 : iResult4 ) );
 }
 
-
-
 int AliHLTTPCCATrackerComponent::DoInit( int argc, const char** argv )
 {
   // Configure the CA tracker component
 
   if ( fTracker ) return EINPROGRESS;
 
-
-  //fTracker = new AliHLTTPCCATrackerFramework();
-  //Do not initialize the TrackerFramework here since the CUDA framework is thread local and DoInit is called from different thread than DoEvent
-
   TString arguments = "";
   for ( int i = 0; i < argc; i++ ) {
     if ( !arguments.IsNull() ) arguments += " ";
     arguments += argv[i];
   }
 
-  return Configure( NULL, NULL, arguments.Data() );
+  int retVal = Configure( NULL, NULL, arguments.Data() );
+  if (retVal == 0) fTracker = new AliHLTTPCCATrackerFramework(fAllowGPU);
+  return(retVal);
 }
 
-
 int AliHLTTPCCATrackerComponent::DoDeinit()
 {
   // see header file for class documentation
@@ -340,8 +335,6 @@ int AliHLTTPCCATrackerComponent::DoDeinit()
   return 0;
 }
 
-
-
 int AliHLTTPCCATrackerComponent::Reconfigure( const char* cdbEntry, const char* chainId )
 {
   // Reconfigure the component from OCDB .
@@ -349,7 +342,6 @@ int AliHLTTPCCATrackerComponent::Reconfigure( const char* cdbEntry, const char*
   return Configure( cdbEntry, chainId, NULL );
 }
 
-
 bool AliHLTTPCCATrackerComponent::CompareClusters( AliHLTTPCSpacePointData *a, AliHLTTPCSpacePointData *b )
 {
   //* Comparison function for sort clusters
index 2e9e4ac4a8c854a3faea026a363369c312898ea2..a8c1865bab865b5d24c0fd38b4a1f793fb744b40 100644 (file)
@@ -50,7 +50,6 @@ int AliHLTTPCCATrackerFramework::InitGPU(int sliceCount, int forceDeviceID)
        if (fGPUTrackerAvailable && (retVal = ExitGPU())) return(retVal);
        retVal = fGPUTracker->InitGPU(sliceCount, forceDeviceID);
        fUseGPUTracker = fGPUTrackerAvailable = retVal == 0;
-       fGPUSliceCount = sliceCount;
        return(retVal);
 }
 
@@ -158,7 +157,7 @@ int AliHLTTPCCATrackerFramework::InitializeSliceParam(int iSlice, AliHLTTPCCAPar
 #define GPULIBNAME "libAliHLTTPCCAGPU"
 #endif
 
-AliHLTTPCCATrackerFramework::AliHLTTPCCATrackerFramework(int allowGPU) : fGPULibAvailable(false), fGPUTrackerAvailable(false), fUseGPUTracker(false), fGPUDebugLevel(0), fGPUSliceCount(0), fGPUTracker(NULL), fGPULib(NULL), fOutputControl( NULL ), fCPUSliceCount(fgkNSlices), fKeepData(false)
+AliHLTTPCCATrackerFramework::AliHLTTPCCATrackerFramework(int allowGPU) : fGPULibAvailable(false), fGPUTrackerAvailable(false), fUseGPUTracker(false), fGPUDebugLevel(0), fGPUTracker(NULL), fGPULib(NULL), fOutputControl( NULL ), fKeepData(false)
 {
        //Constructor
 #ifdef R__WIN32
@@ -204,14 +203,13 @@ AliHLTTPCCATrackerFramework::AliHLTTPCCATrackerFramework(int allowGPU) : fGPULib
                        fGPUTracker = tmp();
                        fGPULibAvailable = true;
                        fGPULib = (void*) (size_t) hGPULib;
-                       HLTImportant("GPU Tracker Created by Wrapper library");
+                       HLTImportant("GPU Tracker library loaded and GPU tracker object created sucessfully (%sactive)", allowGPU ? "" : "in");
                }
        }
 
        if (allowGPU && fGPULibAvailable)
        {
-               fUseGPUTracker = (fGPUTrackerAvailable= (fGPUTracker->InitGPU() == 0));
-               fGPUSliceCount = fGPUTrackerAvailable ? fGPUTracker->GetSliceCount() : 0;
+               fUseGPUTracker = (fGPUTrackerAvailable = (fGPUTracker->InitGPU() == 0));
                HLTInfo("GPU Tracker Initialized and available in framework");
        }
 }
index 580a90b3ae2f08c454f7a6a246d4b993a974def1..c0c9b8e6c32a97b280f08c5947aa1036c1a1bb3c 100644 (file)
@@ -44,7 +44,7 @@ public:
        int ProcessSlices(int firstSlice, int sliceCount, AliHLTTPCCAClusterData* pClusterData, AliHLTTPCCASliceOutput** pOutput);
        unsigned long long int* PerfTimer(int GPU, int iSlice, int iTimer);
 
-       int MaxSliceCount() const { return(fUseGPUTracker ? fGPUSliceCount : fCPUSliceCount); }
+       int MaxSliceCount() const { return(fUseGPUTracker ? (fGPUTrackerAvailable ? fGPUTracker->GetSliceCount() : 0) : fCPUSliceCount); }
        int GetGPUStatus() const { return(fGPUTrackerAvailable + fUseGPUTracker); }
 
        const AliHLTTPCCAParam& Param(int iSlice) const { return(fCPUTrackers[iSlice].Param()); }
@@ -59,14 +59,13 @@ private:
   bool fGPUTrackerAvailable; // Is the GPU Tracker Available?
   bool fUseGPUTracker; // use the GPU tracker 
   int fGPUDebugLevel;  // debug level for the GPU code
-  int fGPUSliceCount;  //How many slices to process parallel
   AliHLTTPCCAGPUTracker* fGPUTracker;  //Pointer to GPU Tracker Object
   void* fGPULib;               //Pointer to GPU Library
 
   AliHLTTPCCASliceOutput::outputControlStruct* fOutputControl;
 
   AliHLTTPCCATracker fCPUTrackers[fgkNSlices];
-  int fCPUSliceCount;
+  static const int fCPUSliceCount = 36;
 
   bool fKeepData;              //Keep temporary data and do not free memory imediately, used for Standalone Debug Event Display
 
index b737ab7a181483d8190c7d76713ad00f89f2f454..f8a8aafbe0777dd1ed4fc5d30c812ff938718a33 100644 (file)
 #include "AliHLTTPCCADef.h"
 #include "AliHLTTPCCATracklet.h"
 #include "AliHLTTPCCATrackletConstructor.h"
-#include "MemoryAssignmentHelpers.h"
-
-//#include "AliHLTTPCCAPerformance.h"
-//#include "TH1D.h"
-
-//#define DRAW
-
-#ifdef DRAW
-#include "AliHLTTPCCADisplay.h"
-#endif //DRAW
 
 #define kMaxRowGap 4
 
@@ -44,101 +34,6 @@ GPUdi() void AliHLTTPCCATrackletConstructor::InitTracklet( AliHLTTPCCATrackParam
   tParam.InitParam();
 }
 
-GPUdi() void AliHLTTPCCATrackletConstructor::ReadData
-#ifndef HLTCA_GPU_PREFETCHDATA
-( int /*iThread*/, AliHLTTPCCASharedMemory& /*s*/, AliHLTTPCCAThreadMemory& /*r*/, AliHLTTPCCATracker& /*tracker*/, int /*iRow*/ )
-{
-       //Prefetch Data to shared memory
-#else
-( int iThread, AliHLTTPCCASharedMemory& s, AliHLTTPCCAThreadMemory& r, AliHLTTPCCATracker& tracker, int iRow )
-{
-  // reconstruction of tracklets, read data step
-    const AliHLTTPCCARow &row = tracker.Row( iRow );
-    //bool jr = !r.fCurrentData;
-
-    // copy hits, grid content and links
-
-    // FIXME: inefficient copy
-    //const int numberOfHitsAligned = NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(row.NHits());
-
-/*     
-#ifdef HLTCA_GPU_REORDERHITDATA
-    ushort2 *sMem1 = reinterpret_cast<ushort2 *>( s.fData[jr] );
-    for ( int i = iThread; i < numberOfHitsAligned; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) {
-      sMem1[i].x = tracker.HitDataY( row, i );
-      sMem1[i].y = tracker.HitDataZ( row, i );
-    }
-#else
-    ushort_v *sMem1 = reinterpret_cast<ushort_v *>( s.fData[jr] );
-    for ( int i = iThread; i < numberOfHitsAligned; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) {
-      sMem1[i] = tracker.HitDataY( row, i );
-    }
-
-    ushort_v *sMem1a = reinterpret_cast<ushort_v *>( s.fData[jr] ) + numberOfHitsAligned;
-    for ( int i = iThread; i < numberOfHitsAligned; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) {
-      sMem1a[i] = tracker.HitDataZ( row, i );
-    }
-#endif //HLTCA_GPU_REORDERHITDATA
-
-    short *sMem2 = reinterpret_cast<short *>( s.fData[jr] ) + 2 * numberOfHitsAligned;
-    for ( int i = iThread; i < numberOfHitsAligned; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) {
-      sMem2[i] = tracker.HitLinkUpData( row, i );
-    }
-       
-    unsigned short *sMem3 = reinterpret_cast<unsigned short *>( s.fData[jr] ) + 3 * numberOfHitsAligned;
-    const int n = row.FullSize(); // + grid content size
-    for ( int i = iThread; i < n; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) {
-      sMem3[i] = tracker.FirstHitInBin( row, i );
-    }*/
-
-       /*for (int k = 0;k < 2;k++)
-       {
-               HLTCA_GPU_ROWCOPY* sharedMem;
-               const HLTCA_GPU_ROWCOPY* sourceMem;
-               int copyCount;
-               switch (k)
-               {
-               case 0:
-                       sourceMem = reinterpret_cast<const HLTCA_GPU_ROWCOPY *>( tracker.HitDataY(row) );
-                       sharedMem = reinterpret_cast<HLTCA_GPU_ROWCOPY *> (reinterpret_cast<ushort_v *>( s.fData[jr] ) + k * numberOfHitsAligned);
-                       copyCount = numberOfHitsAligned * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY);
-                       break;
-               case 1:
-                       sourceMem = reinterpret_cast<const HLTCA_GPU_ROWCOPY *>( tracker.HitDataZ(row) );
-                       sharedMem = reinterpret_cast<HLTCA_GPU_ROWCOPY *> (reinterpret_cast<ushort_v *>( s.fData[jr] ) + k * numberOfHitsAligned);
-                       copyCount = numberOfHitsAligned * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY);
-                       break;
-               case 2:
-                       sourceMem = reinterpret_cast<const HLTCA_GPU_ROWCOPY *>( tracker.HitLinkUpData(row) );
-                       sharedMem = reinterpret_cast<HLTCA_GPU_ROWCOPY *> (reinterpret_cast<ushort_v *>( s.fData[jr] ) + k * numberOfHitsAligned);
-                       copyCount = numberOfHitsAligned * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY);
-                       break;
-               case 1:
-                       sourceMem = reinterpret_cast<const HLTCA_GPU_ROWCOPY *>( tracker.FirstHitInBin(row) );
-                       sharedMem = reinterpret_cast<HLTCA_GPU_ROWCOPY *> (reinterpret_cast<ushort_v *>( s.fData[jr] ) + k * numberOfHitsAligned);
-                       copyCount = NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(row.FullSize()) * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY);
-                       break;
-               }
-               for (int i = iThread;i < copyCount;i += TRACKLET_CONSTRUCTOR_NMEMTHREDS)
-               {
-                       sharedMem[i] = sourceMem[i];
-               }
-       }*/
-
-       for (unsigned int i = iThread;i < row.FullSize() * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY);i += TRACKLET_CONSTRUCTOR_NMEMTHREDS)
-       {
-               reinterpret_cast<HLTCA_GPU_ROWCOPY *> (reinterpret_cast<ushort_v *>( s.fData[!r.fCurrentData] ))[i] = reinterpret_cast<const HLTCA_GPU_ROWCOPY *>( tracker.FirstHitInBin(row) )[i];
-       }
-
-       const HLTCA_GPU_ROWCOPY* const sourceMem = (const HLTCA_GPU_ROWCOPY *) &row;
-       HLTCA_GPU_ROWCOPY* const sharedMem = reinterpret_cast<HLTCA_GPU_ROWCOPY *> ( &s.fRow[!r.fCurrentData] );
-       for (unsigned int i = iThread;i < sizeof(AliHLTTPCCARow) / sizeof(HLTCA_GPU_ROWCOPY);i += TRACKLET_CONSTRUCTOR_NMEMTHREDS)
-       {
-               sharedMem[i] = sourceMem[i];
-       }
-#endif //!HLTCA_GPU_PREFETCHDATA
-}
-
 
 GPUdi() void AliHLTTPCCATrackletConstructor::StoreTracklet
 ( int /*nBlocks*/, int /*nThreads*/, int /*iBlock*/, int /*iThread*/,
@@ -152,13 +47,7 @@ GPUdi() void AliHLTTPCCATrackletConstructor::StoreTracklet
 {
   // reconstruction of tracklets, tracklet store step
 
-  //AliHLTTPCCAPerformance::Instance().HNHitsPerTrackCand()->Fill(r.fNHits);
-
   do {
-    {
-       //std::cout<<"tracklet to store: "<<r.fItr<<", nhits = "<<r.fNHits<<std::endl;
-    }
-
     if ( r.fNHits < TRACKLET_SELECTOR_MIN_HITS ) {
       r.fNHits = 0;
       break;
@@ -194,14 +83,6 @@ GPUdi() void AliHLTTPCCATrackletConstructor::StoreTracklet
   tracklet.SetNHits( r.fNHits );
 
   if ( r.fNHits > 0 ) {
-#ifdef DRAW
-    if ( 0 ) {
-      std::cout << "store tracklet " << r.fItr << ", nhits = " << r.fNHits << std::endl;
-      if ( AliHLTTPCCADisplay::Instance().DrawTracklet( tParam, hitstore, kBlue, 1. ) ) {
-        AliHLTTPCCADisplay::Instance().Ask();
-      }
-    }
-#endif //DRAW
     if ( CAMath::Abs( tParam.Par()[4] ) < 1.e-4 ) tParam.SetPar( 4, 1.e-4 );
        if (r.fStartRow < r.fFirstRow) r.fFirstRow = r.fStartRow;
        tracklet.SetFirstRow( r.fFirstRow );
@@ -219,11 +100,11 @@ GPUdi() void AliHLTTPCCATrackletConstructor::StoreTracklet
          int ih = tracklet.RowHit( iRow );
 #endif //EXTERN_ROW_HITS
       if ( ih >= 0 ) {
-#if defined(HLTCA_GPUCODE) & !defined(HLTCA_GPU_PREFETCHDATA)
+#if defined(HLTCA_GPUCODE)
            tracker.MaximizeHitWeight( s.fRows[ iRow ], ih, w );
 #else
            tracker.MaximizeHitWeight( tracker.Row( iRow ), ih, w );
-#endif //HLTCA_GPUCODE & !HLTCA_GPU_PREFETCHDATA
+#endif //HLTCA_GPUCODE
       }
     }
   }
@@ -242,24 +123,17 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
 {
   // reconstruction of tracklets, tracklets update step
 
-  //std::cout<<"Update tracklet: "<<r.fItr<<" "<<r.fGo<<" "<<r.fStage<<" "<<iRow<<std::endl;
-  bool drawSearch = 0;//r.fItr==2;
-  bool drawFit = 0;//r.fItr==2;
-  bool drawFitted = drawFit ;//|| 1;//r.fItr==16;
-
   if ( !r.fGo ) return;
 
 #ifndef EXTERN_ROW_HITS
   AliHLTTPCCATracklet &tracklet = tracker.Tracklets()[r.fItr];
 #endif //EXTERN_ROW_HITS
 
-#ifdef HLTCA_GPU_PREFETCHDATA
-  const AliHLTTPCCARow &row = s.fRow[r.fCurrentData];
-#elif defined(HLTCA_GPUCODE)
+#if defined(HLTCA_GPUCODE)
   const AliHLTTPCCARow &row = s.fRows[iRow];
 #else
   const AliHLTTPCCARow &row = tracker.Row( iRow );
-#endif //HLTCA_GPU_PREFETCHDATA
+#endif //HLTCA_GPUCODE
 
   float y0 = row.Grid().YMin();
   float stepY = row.HstepY();
@@ -283,42 +157,23 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
          }
 
 
-//#ifdef HLTCA_GPU_PREFETCHDATA
-//      uint4 *tmpint4 = s.fData[r.fCurrentData];
-//#endif
          ushort2 hh;
-//#ifdef HLTCA_GPU_REORDERHITDATA
-//      hh = reinterpret_cast<ushort2*>( tmpint4 )[r.fCurrIH];
-//#else
-//#ifdef HLTCA_GPU_PREFETCHDATA
-//       hh.x = reinterpret_cast<ushort_v*>( tmpint4 )[r.fCurrIH];
-//       hh.y = reinterpret_cast<ushort_v*>( tmpint4 )[NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(row.NHits()) + r.fCurrIH];
-//#else
 #if defined(HLTCA_GPU_TEXTURE_FETCH)
          hh = tex1Dfetch(gAliTexRefu2, ((char*) tracker.Data().HitData() - tracker.Data().GPUTextureBase()) / sizeof(ushort2) + row.HitNumberOffset() + r.fCurrIH);
 #else
          hh = tracker.HitData(row)[r.fCurrIH];
 #endif //HLTCA_GPU_TEXTURE_FETCH
-//#endif
-//#endif
 
       int oldIH = r.fCurrIH;
-//#ifdef HLTCA_GPU_PREFETCHDATA
-//      r.fCurrIH = reinterpret_cast<short*>( tmpint4 )[2 * NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(row.NHits()) + r.fCurrIH]; // read from linkup data
-//#else
 #if defined(HLTCA_GPU_TEXTURE_FETCH)
          r.fCurrIH = tex1Dfetch(gAliTexRefs, ((char*) tracker.Data().HitLinkUpData(row) - tracker.Data().GPUTextureBase()) / sizeof(unsigned short) + r.fCurrIH);
 #else
          r.fCurrIH = tracker.HitLinkUpData(row)[r.fCurrIH]; // read from linkup data
 #endif //HLTCA_GPU_TEXTURE_FETCH
-//#endif
 
       float x = row.X();
       float y = y0 + hh.x * stepY;
       float z = z0 + hh.y * stepZ;
-#ifdef DRAW
-      if ( drawFit ) std::cout << " fit tracklet: new hit " << oldIH << ", xyz=" << x << " " << y << " " << z << std::endl;
-#endif //DRAW
 
       if ( iRow == r.fStartRow ) {
         tParam.SetX( x );
@@ -326,9 +181,6 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
         tParam.SetZ( z );
         r.fLastY = y;
         r.fLastZ = z;
-        #ifdef DRAW
-        if ( drawFit ) std::cout << " fit tracklet " << r.fItr << ", row " << iRow << " first row" << std::endl;
-        #endif //DRAW
       } else {
 
         float err2Y, err2Z;
@@ -349,13 +201,6 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
           tParam.SetCov( 0, err2Y );
           tParam.SetCov( 2, err2Z );
         }
-        if ( drawFit ) {
-          #ifdef DRAW
-          std::cout << " fit tracklet " << r.fItr << ", row " << iRow << " transporting.." << std::endl;
-          std::cout << " params before transport=" << std::endl;
-          tParam.Print();
-          #endif //DRAW
-        }
         float sinPhi, cosPhi;
         if ( r.fNHits >= 10 && CAMath::Abs( tParam.SinPhi() ) < .99 ) {
           sinPhi = tParam.SinPhi();
@@ -364,13 +209,7 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
           sinPhi = dy * ri;
           cosPhi = dx * ri;
         }
-        #ifdef DRAW
-        if ( drawFit ) std::cout << "sinPhi0 = " << sinPhi << ", cosPhi0 = " << cosPhi << std::endl;
-        #endif //DRAW
         if ( !tParam.TransportToX( x, sinPhi, cosPhi, tracker.Param().ConstBz(), -1 ) ) {
-          #ifdef DRAW
-          if ( drawFit ) std::cout << " tracklet " << r.fItr << ", row " << iRow << ": can not transport!!" << std::endl;
-                 #endif //DRAW
 #ifndef EXTERN_ROW_HITS
           tracklet.SetRowHit( iRow, -1 );
 #else
@@ -378,24 +217,9 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
 #endif //EXTERN_ROW_HITS
           break;
         }
-        //std::cout<<"mark1 "<<r.fItr<<std::endl;
-        //tParam.Print();
         tracker.GetErrors2( iRow, tParam.GetZ(), sinPhi, cosPhi, tParam.GetDzDs(), err2Y, err2Z );
-        //std::cout<<"mark2"<<std::endl;
-
-        if ( drawFit ) {
-          #ifdef DRAW
-          std::cout << " params after transport=" << std::endl;
-          tParam.Print();
-          std::cout << "fit tracklet before filter: " << r.fItr << ", row " << iRow << " errs=" << err2Y << " " << err2Z << std::endl;
-          AliHLTTPCCADisplay::Instance().DrawTracklet( tParam, hitstore, kBlue, 2., 1 );
-          AliHLTTPCCADisplay::Instance().Ask();
-                 #endif //DRAW
-        }
+
         if ( !tParam.Filter( y, z, err2Y, err2Z, .99 ) ) {
-          #ifdef DRAW
-          if ( drawFit ) std::cout << " tracklet " << r.fItr << ", row " << iRow << ": can not filter!!" << std::endl;
-          #endif //DRAW
 #ifndef EXTERN_ROW_HITS
           tracklet.SetRowHit( iRow, -1 );
 #else
@@ -409,14 +233,6 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
 #else
          tracker.TrackletRowHits()[iRow * s.fNTracklets + r.fItr] = oldIH;
 #endif //!EXTERN_ROW_HITS
-      if ( drawFit ) {
-        #ifdef DRAW
-        std::cout << "fit tracklet after filter " << r.fItr << ", row " << iRow << std::endl;
-        tParam.Print();
-        AliHLTTPCCADisplay::Instance().DrawTracklet( tParam, hitstore, kGreen, 2. );
-        AliHLTTPCCADisplay::Instance().Ask();
-               #endif //DRAW
-      }
       r.fNHits++;
       r.fLastRow = iRow;
       r.fEndRow = iRow;
@@ -424,36 +240,15 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
     } while ( 0 );
 
     if ( r.fCurrIH < 0 ) {
-      #ifdef DRAW
-      if ( drawFitted ) std::cout << "fitted tracklet " << r.fItr << ", nhits=" << r.fNHits << std::endl;
-      #endif //DRAW
       r.fStage = 1;
-      //AliHLTTPCCAPerformance::Instance().HNHitsPerSeed()->Fill(r.fNHits);
       if ( CAMath::Abs( tParam.SinPhi() ) > .999 ) {
-        #ifdef DRAW
-        if ( drawFitted ) std::cout << " fitted tracklet  error: sinPhi=" << tParam.SinPhi() << std::endl;
-        #endif //DRAW
         r.fNHits = 0; r.fGo = 0;
       } else {
         //tParam.SetCosPhi( CAMath::Sqrt(1-tParam.SinPhi()*tParam.SinPhi()) );
       }
-      if ( drawFitted ) {
-        #ifdef DRAW
-        std::cout << "fitted tracklet " << r.fItr << " miss=" << r.fNMissed << " go=" << r.fGo << std::endl;
-        tParam.Print();
-        AliHLTTPCCADisplay::Instance().DrawTracklet( tParam, hitstore, kBlue );
-        AliHLTTPCCADisplay::Instance().Ask();
-               #endif //DRAW
-      }
     }
   } else { // forward/backward searching part
     do {
-      if ( drawSearch ) {
-        #ifdef DRAW
-        std::cout << "search tracklet " << r.fItr << " row " << iRow << " miss=" << r.fNMissed << " go=" << r.fGo << " stage=" << r.fStage << std::endl;
-        #endif //DRAW
-      }
-
       if ( r.fStage == 2 && ( ( iRow >= r.fEndRow ) ||
                               ( iRow >= r.fStartRow && ( iRow - r.fStartRow ) % 2 == 0 )
                             ) ) break;
@@ -465,16 +260,7 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
 
       float x = row.X();
       float err2Y, err2Z;
-      if ( drawSearch ) {
-        #ifdef DRAW
-        std::cout << "tracklet " << r.fItr << " before transport to row " << iRow << " : " << std::endl;
-        tParam.Print();
-        #endif //DRAW
-      }
       if ( !tParam.TransportToX( x, tParam.SinPhi(), tParam.GetCosPhi(), tracker.Param().ConstBz(), .99 ) ) {
-        #ifdef DRAW
-        if ( drawSearch ) std::cout << " tracklet " << r.fItr << ", row " << iRow << ": can not transport!!" << std::endl;
-        #endif //DRAW
 #ifndef EXTERN_ROW_HITS
                tracklet.SetRowHit(iRow, -1);
 #else
@@ -491,30 +277,10 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
 #endif //!EXTERN_ROW_HITS
         break;
       }
-      if ( drawSearch ) {
-               #ifdef DRAW
-        std::cout << "tracklet " << r.fItr << " after transport to row " << iRow << " : " << std::endl;
-        tParam.Print();
-        AliHLTTPCCADisplay::Instance().DrawTracklet( tParam, hitstore, kBlue, 2., 1 );
-        AliHLTTPCCADisplay::Instance().Ask();
-               #endif //DRAW
-      }
-#ifdef HLTCA_GPU_PREFETCHDATA
-      uint4 *tmpint4 = s.fData[r.fCurrentData];
-#endif //HLTCA_GPU_PREFETCHDATA
-
-//#ifdef HLTCA_GPU_REORDERHITDATA
-//      const ushort2 *hits = reinterpret_cast<ushort2*>( tmpint4 );
-//#else
-//#ifdef HLTCA_GPU_PREFETCHDATA
-//       const ushort_v *hitsx = reinterpret_cast<ushort_v*>( tmpint4 );
-//       const ushort_v *hitsy = reinterpret_cast<ushort_v*>( tmpint4 ) + NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(row.NHits());
-//#else
+
 #ifndef HLTCA_GPU_TEXTURE_FETCH
          const ushort2 *hits = tracker.HitData(row);
 #endif //!HLTCA_GPU_TEXTURE_FETCH
-//#endif
-//#endif
 
       float fY = tParam.GetY();
       float fZ = tParam.GetZ();
@@ -532,22 +298,12 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
 
         unsigned int fHitYfst = 1, fHitYlst = 0, fHitYfst1 = 1, fHitYlst1 = 0;
 
-        if ( drawSearch ) {
-#ifdef DRAW
-          std::cout << " tracklet " << r.fItr << ", row " << iRow << ": grid N=" << row.Grid().N() << std::endl;
-          std::cout << " tracklet " << r.fItr << ", row " << iRow << ": minbin=" << fIndYmin << std::endl;
-#endif //DRAW
-        }
         {
           int nY = row.Grid().Ny();
 
-//#ifdef HLTCA_GPU_PREFETCHDATA
-//               const unsigned short *sGridP = ( reinterpret_cast<unsigned short*>( tmpint4 ) );
-//#else
 #ifndef HLTCA_GPU_TEXTURE_FETCH
                  const unsigned short *sGridP = tracker.FirstHitInBin(row);
 #endif //!HLTCA_GPU_TEXTURE_FETCH
-//#endif
 
 #ifdef HLTCA_GPU_TEXTURE_FETCH
                  fHitYfst = tex1Dfetch(gAliTexRefu, ((char*) tracker.Data().FirstHitInBin(row) - tracker.Data().GPUTextureBase()) / sizeof(unsigned short) + fIndYmin);
@@ -564,37 +320,9 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
           assert( (signed) fHitYlst <= row.NHits() );
           assert( (signed) fHitYfst1 <= row.NHits() );
           assert( (signed) fHitYlst1 <= row.NHits() );
-          if ( drawSearch ) {
-#ifdef DRAW
-            std::cout << " Grid, row " << iRow << ": nHits=" << row.NHits() << ", grid n=" << row.Grid().N() << ", c[n]=" << sGridP[row.Grid().N()] << std::endl;
-            std::cout << "hit steps = " << stepY << " " << stepZ << std::endl;
-            std::cout << " Grid bins:" << std::endl;
-            for ( unsigned int i = 0; i < row.Grid().N(); i++ ) {
-              std::cout << " bin " << i << ": ";
-              for ( int j = sGridP[i]; j < sGridP[i+1]; j++ ) {
-                ushort2 hh = hits[j];
-                float y = y0 + hh.x * stepY;
-                float z = z0 + hh.y * stepZ;
-                std::cout << "[" << j << "|" << y << "," << z << "] ";
-              }
-              std::cout << std::endl;
-            }
-#endif //DRAW
-          }
-#ifdef DRAW
-          if ( sGridP[row.Grid().N()] != row.NHits() ) {
-            std::cout << " grid, row " << iRow << ": nHits=" << row.NHits() << ", grid n=" << row.Grid().N() << ", c[n]=" << sGridP[row.Grid().N()] << std::endl;
-            //exit(0);
-          }
-#endif //DRAW
-        }
-#ifdef DRAW
-        if ( drawSearch ) {
-          std::cout << " tracklet " << r.fItr << ", row " << iRow << ", yz= " << fY << "," << fZ << ": search hits=" << fHitYfst << " " << fHitYlst << " / " << fHitYfst1 << " " << fHitYlst1 << std::endl;
-          std::cout << " hit search :" << std::endl;
         }
-#endif //DRAW
-        for ( unsigned int fIh = fHitYfst; fIh < fHitYlst; fIh++ ) {
+
+               for ( unsigned int fIh = fHitYfst; fIh < fHitYlst; fIh++ ) {
           assert( (signed) fIh < row.NHits() );
           ushort2 hh;
 #if defined(HLTCA_GPU_TEXTURE_FETCH)
@@ -605,11 +333,6 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
           int ddy = ( int )( hh.x ) - fY0;
           int ddz = ( int )( hh.y ) - fZ0;
           int dds = CAMath::Abs( ddy ) + CAMath::Abs( ddz );
-          if ( drawSearch ) {
-            #ifdef DRAW
-            std::cout << fIh << ": hityz= " << hh.x << " " << hh.y << "(" << hh.x*stepY << " " << hh.y*stepZ << "), trackyz=" << fY0 << " " << fZ0 << "(" << fY0*stepY << " " << fZ0*stepZ << "), dy,dz,ds= " << ddy << " " << ddz << " " << dds << "(" << ddy*stepY << " " << ddz*stepZ << std::endl;
-            #endif //DRAW
-          }
           if ( dds < ds ) {
             ds = dds;
             best = fIh;
@@ -626,11 +349,6 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
           int ddy = ( int )( hh.x ) - fY0;
           int ddz = ( int )( hh.y ) - fZ0;
           int dds = CAMath::Abs( ddy ) + CAMath::Abs( ddz );
-          if ( drawSearch ) {
-            #ifdef DRAW
-            std::cout << fIh << ": hityz= " << hh.x << " " << hh.y << "(" << hh.x*stepY << " " << hh.y*stepZ << "), trackyz=" << fY0 << " " << fZ0 << "(" << fY0*stepY << " " << fZ0*stepZ << "), dy,dz,ds= " << ddy << " " << ddz << " " << dds << "(" << ddy*stepY << " " << ddz*stepZ << std::endl;
-            #endif //DRAW
-          }
           if ( dds < ds ) {
             ds = dds;
             best = fIh;
@@ -647,15 +365,6 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
 #endif //!EXTERN_ROW_HITS
                  break;
          }
-      if ( drawSearch ) {
-        #ifdef DRAW
-        std::cout << "hit search " << r.fItr << ", row " << iRow << " hit " << best << " found" << std::endl;
-        AliHLTTPCCADisplay::Instance().DrawSliceHit( iRow, best, kRed, 1. );
-        AliHLTTPCCADisplay::Instance().Ask();
-        AliHLTTPCCADisplay::Instance().DrawSliceHit( iRow, best, kWhite, 1 );
-        AliHLTTPCCADisplay::Instance().DrawSliceHit( iRow, best );
-               #endif //DRAW
-      }
 
       ushort2 hh;
 #if defined(HLTCA_GPU_TEXTURE_FETCH)
@@ -664,10 +373,7 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
                  hh = hits[best];
 #endif //HLTCA_GPU_TEXTURE_FETCH
 
-      //std::cout<<"mark 3, "<<r.fItr<<std::endl;
-      //tParam.Print();
       tracker.GetErrors2( iRow, *( ( AliHLTTPCCATrackParam* )&tParam ), err2Y, err2Z );
-      //std::cout<<"mark 4"<<std::endl;
 
       float y = y0 + hh.x * stepY;
       float z = z0 + hh.y * stepZ;
@@ -680,18 +386,7 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
       if ( sy2 > 2. ) sy2 = 2.;
       if ( sz2 > 2. ) sz2 = 2.;
 
-      if ( drawSearch ) {
-        #ifdef DRAW
-        std::cout << "dy,sy= " << dy << " " << CAMath::Sqrt( sy2 ) << ", dz,sz= " << dz << " " << CAMath::Sqrt( sz2 ) << std::endl;
-        std::cout << "dy,dz= " << dy << " " << dz << ", sy,sz= " << CAMath::Sqrt( sy2 ) << " " << CAMath::Sqrt( sz2 ) << ", sy,sz= " << CAMath::Sqrt( kFactor*( tParam.GetErr2Y() +  err2Y ) ) << " " << CAMath::Sqrt( kFactor*( tParam.GetErr2Z() +  err2Z ) ) << std::endl;
-        #endif //DRAW
-      }
       if ( CAMath::FMulRZ( dy, dy ) > sy2 || CAMath::FMulRZ( dz, dz ) > sz2  ) {
-        if ( drawSearch ) {
-          #ifdef DRAW
-          std::cout << "found hit is out of the chi2 window\n " << std::endl;
-          #endif //DRAW
-        }
 #ifndef EXTERN_ROW_HITS
                tracklet.SetRowHit(iRow, -1);
 #else
@@ -699,18 +394,7 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
 #endif //!EXTERN_ROW_HITS
         break;
       }
-#ifdef DRAW
-      //if( SAVE() ) hitstore[ iRow ] = best;
-      //std::cout<<"hit search before filter: "<<r.fItr<<", row "<<iRow<<std::endl;
-      //AliHLTTPCCADisplay::Instance().DrawTracklet(tParam, hitstore, kBlue);
-      //AliHLTTPCCADisplay::Instance().Ask();
-#endif //DRAW
       if ( !tParam.Filter( y, z, err2Y, err2Z, .99 ) ) {
-        if ( drawSearch ) {
-          #ifdef DRAW
-          std::cout << "tracklet " << r.fItr << " at row " << iRow << " : can not filter!!!! " << std::endl;
-          #endif //DRAW
-        }
         break;
       }
 #ifndef EXTERN_ROW_HITS
@@ -718,14 +402,6 @@ GPUdi() void AliHLTTPCCATrackletConstructor::UpdateTracklet
 #else
          tracker.TrackletRowHits()[iRow * s.fNTracklets + r.fItr] = best;
 #endif //!EXTERN_ROW_HITS
-      if ( drawSearch ) {
-        #ifdef DRAW
-        std::cout << "tracklet " << r.fItr << " after filter at row " << iRow << " : " << std::endl;
-        tParam.Print();
-        AliHLTTPCCADisplay::Instance().DrawTracklet( tParam, hitstore, kRed );
-        AliHLTTPCCADisplay::Instance().Ask();
-               #endif //DRAW
-      }
       r.fNHits++;
       r.fNMissed = 0;
       if ( r.fStage == 1 ) r.fLastRow = iRow;
@@ -781,7 +457,7 @@ GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tr
        //Fetch a new trackled to be processed by this thread
        __syncthreads();
        int nextTracketlFirstRun = sMem.fNextTrackletFirstRun;
-       if (threadIdx.x  == TRACKLET_CONSTRUCTOR_NMEMTHREDS)
+       if (threadIdx.x == 0)
        {
                sMem.fNTracklets = *tracker.NTracklets();
                if (sMem.fNextTrackletFirstRun)
@@ -805,14 +481,13 @@ GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tr
                                else
                                {
                                        sMem.fNextTrackletFirst = nTracklet.x;
-                                       sMem.fNextTrackletNoDummy = 1;
                                }
                        }
 #endif //HLTCA_GPU_SCHED_FIXED_START
                }
                else
                {
-                       const int nFetchTracks = CAMath::Max(CAMath::Min((*tracker.RowBlockPos(Reverse, RowBlock)).x - (*tracker.RowBlockPos(Reverse, RowBlock)).y, HLTCA_GPU_THREAD_COUNT - TRACKLET_CONSTRUCTOR_NMEMTHREDS), 0);
+                       const int nFetchTracks = CAMath::Max(CAMath::Min((*tracker.RowBlockPos(Reverse, RowBlock)).x - (*tracker.RowBlockPos(Reverse, RowBlock)).y, HLTCA_GPU_THREAD_COUNT), 0);
                        sMem.fNextTrackletCount = nFetchTracks;
                        const int nUseTrack = nFetchTracks ? CAMath::AtomicAdd(&(*tracker.RowBlockPos(Reverse, RowBlock)).y, nFetchTracks) : 0;
                        sMem.fNextTrackletFirst = nUseTrack;
@@ -830,7 +505,6 @@ GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tr
                                        tracker.RowBlockTracklets(Reverse, RowBlock)[(nStartFillTrack + i) % HLTCA_GPU_MAX_TRACKLETS] = -3;     //Dummy filling track
                                }
                        }
-                       sMem.fNextTrackletNoDummy = 0;
                }
        }
        __syncthreads();
@@ -839,25 +513,19 @@ GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tr
        {
                return(-2);             //No more track in this RowBlock
        }
-#if HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS > 0
-       else if (threadIdx.x < TRACKLET_CONSTRUCTOR_NMEMTHREDS)
-       {
-               return(-1);
-       }
-#endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS > 0
-       else if (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS >= sMem.fNextTrackletCount)
+       else if (threadIdx.x >= sMem.fNextTrackletCount)
        {
                return(-1);             //No track in this RowBlock for this thread
        }
        else if (nextTracketlFirstRun)
        {
-               if (threadIdx.x == TRACKLET_CONSTRUCTOR_NMEMTHREDS) sMem.fNextTrackletFirstRun = 0;
+               if (threadIdx.x == 0) sMem.fNextTrackletFirstRun = 0;
                mustInit = 1;
-               return(sMem.fNextTrackletFirst + threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS);
+               return(sMem.fNextTrackletFirst + threadIdx.x);
        }
        else
        {
-               const int nTrackPos = sMem.fNextTrackletFirst + threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS;
+               const int nTrackPos = sMem.fNextTrackletFirst + threadIdx.x;
                mustInit = (nTrackPos < tracker.RowBlockPos(Reverse, RowBlock)->w);
                volatile int* const ptrTracklet = &tracker.RowBlockTracklets(Reverse, RowBlock)[nTrackPos % HLTCA_GPU_MAX_TRACKLETS];
                int nTracklet;
@@ -893,7 +561,7 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
        GPUshared() AliHLTTPCCASharedMemory sMem;
 
 #ifdef HLTCA_GPU_SCHED_FIXED_START
-       if (threadIdx.x == TRACKLET_CONSTRUCTOR_NMEMTHREDS)
+       if (threadIdx.x == 0)
        {
                sMem.fNextTrackletFirstRun = 1;
        }
@@ -901,7 +569,7 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
 #endif //HLTCA_GPU_SCHED_FIXED_START
 
 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
-       if (threadIdx.x == TRACKLET_CONSTRUCTOR_NMEMTHREDS)
+       if (threadIdx.x == 0)
        {
                sMem.fMaxSync = 0;
        }
@@ -938,7 +606,6 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
                                        __syncthreads();
                                        threadSync = CAMath::Min(sMem.fMaxSync, 100000000 / blockDim.x / gridDim.x);
 #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
-#ifndef HLTCA_GPU_PREFETCHDATA
                                        if (!sharedRowsInitialized)
                                        {
                                                for (int i = threadIdx.x;i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x)
@@ -947,14 +614,13 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
                                                }
                                                sharedRowsInitialized = 1;
                                        }
-#endif //!HLTCA_GPU_PREFETCHDATA
 #ifdef HLTCA_GPU_RESCHED
                                        short2 storeToRowBlock;
                                        int storePosition = 0;
-                                       if (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1))
+                                       if (threadIdx.x < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1))
                                        {
-                                               const int nReverse = (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS) / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
-                                               const int nRowBlock = (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS) % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+                                               const int nReverse = threadIdx.x / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+                                               const int nRowBlock = threadIdx.x % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
                                                sMem.fTrackletStoreCount[nReverse][nRowBlock] = 0;
                                        }
 #endif //HLTCA_GPU_RESCHED
@@ -988,22 +654,12 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
                                        {
                                                CopyTrackletTempData( rMemGlobal, rMem, tParamGlobal, tParam );
                                        }
-#ifdef HLTCA_GPU_PREFETCHDATA
-                                       else if (threadIdx.x < TRACKLET_CONSTRUCTOR_NMEMTHREDS)
-                                       {
-                                               ReadData(threadIdx.x, sMem, rMem, tracker, iReverse ? (HLTCA_ROW_COUNT - 1 - iRowBlock * HLTCA_GPU_SCHED_ROW_STEP) : (CAMath::Max(1, iRowBlock * HLTCA_GPU_SCHED_ROW_STEP)));
-                                       }
-#endif //HLTCA_GPU_PREFETCHDATA
                                        rMem.fItr = iTracklet;
                                        rMem.fGo = (iTracklet >= 0);
 
 #ifdef HLTCA_GPU_RESCHED
                                        storeToRowBlock.x = iRowBlock + 1;
                                        storeToRowBlock.y = iReverse;
-#ifdef HLTCA_GPU_PREFETCHDATA
-                                       rMem.fCurrentData ^= 1;
-                                       __syncthreads();
-#endif //HLTCA_GPU_PREFETCHDATA
                                        if (iReverse)
                                        {
                                                for (int j = HLTCA_ROW_COUNT - 1 - iRowBlock * HLTCA_GPU_SCHED_ROW_STEP;j >= CAMath::Max(0, HLTCA_ROW_COUNT - (iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP);j--)
@@ -1012,28 +668,15 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
                                                        if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && !(j >= rMem.fEndRow || ( j >= rMem.fStartRow && j - rMem.fStartRow % 2 == 0)))
                                                                pTracker[0].StageAtSync()[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1;
 #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
-#ifdef HLTCA_GPU_PREFETCHDATA
-                                                       if (threadIdx.x < TRACKLET_CONSTRUCTOR_NMEMTHREDS && j > CAMath::Max(0, HLTCA_ROW_COUNT - (iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP))
-                                                       {
-                                                               ReadData(threadIdx.x, sMem, rMem, tracker, j - 1);
-                                                       }
-                                                       else
-#endif //HLTCA_GPU_PREFETCHDATA
                                                        if (iTracklet >= 0)
                                                        {
                                                                UpdateTracklet(gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j);
                                                                if (rMem.fNMissed > kMaxRowGap && j <= rMem.fStartRow)
                                                                {
                                                                        rMem.fGo = 0;
-#ifndef HLTCA_GPU_PREFETCHDATA
                                                                        break;
-#endif //!HLTCA_GPU_PREFETCHDATA
                                                                }
                                                        }
-#ifdef HLTCA_GPU_PREFETCHDATA
-                                                       __syncthreads();
-                                                       rMem.fCurrentData ^= 1;
-#endif //HLTCA_GPU_PREFETCHDATA
                                                }
                                                        
                                                if (iTracklet >= 0 && (!rMem.fGo || iRowBlock == HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP))
@@ -1049,32 +692,15 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
                                                        if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && j >= rMem.fStartRow && (rMem.fStage > 0 || rMem.fCurrIH >= 0 || (j - rMem.fStartRow) % 2 == 0 ))
                                                                pTracker[0].StageAtSync()[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1;
 #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
-#ifdef HLTCA_GPU_PREFETCHDATA
-                                                       if (threadIdx.x < TRACKLET_CONSTRUCTOR_NMEMTHREDS && j < CAMath::Min((iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP, HLTCA_ROW_COUNT) - 1)
-                                                       {
-                                                               ReadData(threadIdx.x, sMem, rMem, tracker, j + 1);
-                                                       }
-                                                       else
-#endif //HLTCA_GPU_PREFETCHDATA
                                                        if (iTracklet >= 0)
                                                        {
                                                                UpdateTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j);
-#ifndef HLTCA_GPU_PREFETCHDATA
                                                                //if (rMem.fNMissed > kMaxRowGap || rMem.fGo == 0) break;       //DR!!! CUDA Crashes with this enabled
-#endif //!HLTCA_GPU_PREFETCHDATA
                                                        }
-#ifdef HLTCA_GPU_PREFETCHDATA
-                                                       __syncthreads();
-                                                       rMem.fCurrentData ^= 1;
-#endif //HLTCA_GPU_PREFETCHDATA
                                                }
                                                if (rMem.fGo && (rMem.fNMissed > kMaxRowGap || iRowBlock == HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP))
                                                {
-#if defined(HLTCA_GPU_PREFETCHDATA)
-                                                       if ( !tParam.TransportToX( tracker.Row( rMem.fEndRow ).X(), tracker.Param().ConstBz(), .999 ) )
-#else
                                                        if ( !tParam.TransportToX( sMem.fRows[ rMem.fEndRow ].X(), tracker.Param().ConstBz(), .999 ) )
-#endif //HLTCA_GPU_PREFETCHDATA
                                                        {
                                                                rMem.fGo = 0;
                                                        }
@@ -1142,10 +768,10 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A
 
 #ifdef HLTCA_GPU_RESCHED
                                        __syncthreads();
-                                       if (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1))
+                                       if (threadIdx.x < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1))
                                        {
-                                               const int nReverse = (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS) / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
-                                               const int nRowBlock = (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS) % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+                                               const int nReverse = threadIdx.x / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+                                               const int nRowBlock = threadIdx.x % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
                                                if (sMem.fTrackletStoreCount[nReverse][nRowBlock])
                                                {
                                                        sMem.fTrackletStoreCount[nReverse][nRowBlock] = CAMath::AtomicAdd(&tracker.RowBlockPos(nReverse, nRowBlock)->x, sMem.fTrackletStoreCount[nReverse][nRowBlock]);
index 28fd95c99031c84a97792d7b611ea5ff1f117b36..c24d331c68affa7e62c6f4e25c621da848003fb5 100644 (file)
@@ -35,27 +35,20 @@ class AliHLTTPCCATrackletConstructor
       public:
 #if !defined(HLTCA_GPUCODE)
         AliHLTTPCCASharedMemory()
-                       : fNextTrackletFirst(0), fNextTrackletCount(0), fNextTrackletNoDummy(0), fNextTrackletStupidDummy(0), fNextTrackletFirstRun(0), fNTracklets(0), fSliceDone(0) {}
+                       : fNextTrackletFirst(0), fNextTrackletCount(0), fNextTrackletStupidDummy(0), fNextTrackletFirstRun(0), fNTracklets(0) {}
 
         AliHLTTPCCASharedMemory( const AliHLTTPCCASharedMemory& /*dummy*/ )
-                       : fNextTrackletFirst(0), fNextTrackletCount(0), fNextTrackletNoDummy(0), fNextTrackletStupidDummy(0), fNextTrackletFirstRun(0), fNTracklets(0), fSliceDone(0) {}
+                       : fNextTrackletFirst(0), fNextTrackletCount(0), fNextTrackletStupidDummy(0), fNextTrackletFirstRun(0), fNTracklets(0) {}
         AliHLTTPCCASharedMemory& operator=( const AliHLTTPCCASharedMemory& /*dummy*/ ) { return *this; }
 #endif //HLTCA_GPUCODE
 
       protected:
-#ifdef HLTCA_GPU_PREFETCHDATA
-        uint4 fData[2][ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4]; // temp memory
-               AliHLTTPCCARow fRow[2]; // rows
-#else
       AliHLTTPCCARow fRows[HLTCA_ROW_COUNT]; // rows
-#endif //HLTCA_GPU_PREFETCHDATA
-      int fNextTrackletFirst; //! to be commented by D.Rohr
-      int fNextTrackletCount; //! to be commented by D.Rohr
-      int fNextTrackletNoDummy; //! to be commented by D.Rohr
-      int fNextTrackletStupidDummy; //! to be commented by D.Rohr
-      int fNextTrackletFirstRun; //! to be commented by D.Rohr
-      int fNTracklets; // n tracklets
-      int fSliceDone; //! to be commented by D.Rohr
+      int fNextTrackletFirst; //First tracklet to be processed by CUDA block during next iteration
+      int fNextTrackletCount; //Number of Tracklets to be processed by CUDA block during next iteration
+      int fNextTrackletStupidDummy; //Shared Dummy variable to access
+      int fNextTrackletFirstRun; //First run for dynamic scheduler?
+      int fNTracklets; // Total number of tracklets
 
       int fStartRows[HLTCA_GPU_THREAD_COUNT / HLTCA_GPU_WARP_SIZE + 1]; // start rows
       int fEndRows[HLTCA_GPU_THREAD_COUNT / HLTCA_GPU_WARP_SIZE + 1]; // end rows
@@ -64,7 +57,7 @@ class AliHLTTPCCATrackletConstructor
       int fMaxSync; //! to be commented by D.Rohr
 #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
 
-      int fTrackletStoreCount[2][HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1];//! to be commented by D.Rohr
+      int fTrackletStoreCount[2][HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1];//Number of tracklets to store in tracklet pool for rescheduling
     };
 
     class  AliHLTTPCCAThreadMemory
@@ -96,6 +89,7 @@ class AliHLTTPCCATrackletConstructor
         float fLastZ; // Z of the last fitted cluster
     };
 
+       //Structure to store track parameters and temporary thread variables in global memory when rescheduling
        struct AliHLTTPCCAGPUTempMemory
        {
          AliHLTTPCCAThreadMemory fThreadMem;// thread memory
@@ -104,8 +98,6 @@ class AliHLTTPCCATrackletConstructor
 
        GPUd() static void InitTracklet ( AliHLTTPCCATrackParam &tParam );
 
-    GPUd() static void ReadData( int iThread, AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, int iRow );
-
     GPUd() static void UpdateTracklet
     ( int nBlocks, int nThreads, int iBlock, int iThread,
       AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam, int iRow );
@@ -125,15 +117,6 @@ class AliHLTTPCCATrackletConstructor
 #endif //HLTCA_GPUCODE
 
     GPUd() static bool SAVE() { return 1; }
-
-#if defined(HLTCA_GPUCODE)
-    //GPUhd() inline int NMemThreads() { return 128; }
-#define TRACKLET_CONSTRUCTOR_NMEMTHREDS HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS
-#else
-    //GPUhd() inline int NMemThreads() { return 1; }
-#define TRACKLET_CONSTRUCTOR_NMEMTHREDS 1
-#endif //!HLTCA_GPUCODE
-
 };
 
 #endif //ALIHLTTPCCATRACKLETCONSTRUCTOR_H
index 5fe6692ff86bf05e00eaa66d06b929a373d20d5a..0d8912a5042d7212a2a5094b4370cc4dc8d1c077 100644 (file)
@@ -61,7 +61,7 @@ template<typename T, unsigned int Alignment> static inline T *AssignMemory( char
 template<typename T, unsigned int Alignment> GPUhd() static T *_assignMemory( char *&mem, unsigned int size )
 {
   STATIC_ASSERT( ( Alignment & ( Alignment - 1 ) ) == 0, Alignment_needs_to_be_a_multiple_of_2 );
-  AlignTo<Alignment>( mem );
+  AlignTo<Alignment < sizeof( HLTCA_GPU_ROWALIGNMENT ) ? sizeof( HLTCA_GPU_ROWALIGNMENT ) : Alignment>( mem );
   T *r = reinterpret_cast<T *>( mem );
   mem += size * sizeof( T );
   return r;
@@ -69,7 +69,7 @@ template<typename T, unsigned int Alignment> GPUhd() static T *_assignMemory( ch
 
 template<typename T> GPUhd() static inline void AssignMemory( T *&dst, char *&mem, int count )
 {
-  dst = _assignMemory < T, ( sizeof( T ) & ( sizeof( T ) - 1 ) ) == 0 && sizeof( T ) <= 16 ? sizeof( T ) : sizeof( void * ) > ( mem, count );
+       dst = _assignMemory < T, ( sizeof( T ) & ( sizeof( T ) - 1 ) ) == 0 && sizeof( T ) <= 16 ? sizeof( T ) : sizeof( void * ) > ( mem, count );
 }
 
 #endif // MEMORYASSIGNMENTHELPERS_H