HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu

   1 // **************************************************************************
   2 // This file is property of and copyright by the ALICE HLT Project          *
   3 // ALICE Experiment at CERN, All rights reserved.                           *
   4 //                                                                          *
   5 // Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
   6 //                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *
   7 //                                      David Rohr <drohr@kip.uni-heidelberg.de>                                *
   8 //                  for The ALICE HLT Project.                              *
   9 //                                                                          *
  10 // Permission to use, copy, modify and distribute this software and its     *
  11 // documentation strictly for non-commercial purposes is hereby granted     *
  12 // without fee, provided that the above copyright notice appears in all     *
  13 // copies and that both the copyright notice and this permission notice     *
  14 // appear in the supporting documentation. The authors make no claims       *
  15 // about the suitability of this software for any purpose. It is            *
  16 // provided "as is" without express or implied warranty.                    *
  17 //                                                                          *
  18 //***************************************************************************
  19
  20 #define HLTCA_GPU_DEFAULT_MAX_SLICE_COUNT 36
  21 #define FERMI
  22 #include "AliHLTTPCCAGPUTrackerNVCC.h"
  23
  24 #ifdef HLTCA_GPUCODE
  25 #include <cuda.h>
  26 #include <sm_11_atomic_functions.h>
  27 #include <sm_12_atomic_functions.h>
  28 #endif
  29
  30 #ifdef R__WIN32
  31 #else
  32 #include <sys/syscall.h>
  33 #include <semaphore.h>
  34 #include <fcntl.h>
  35 #endif
  36 #include "AliHLTTPCCADef.h"
  37 #include "AliHLTTPCCAGPUConfig.h"
  38
  39 #if defined(HLTCA_STANDALONE) & !defined(_WIN32)
  40 #include <sched.h>
  41 #endif
  42
  43 #include <iostream>
  44 #include <fstream>
  45
  46 //Disable assertions since they produce errors in GPU Code
  47 #ifdef assert
  48 #undef assert
  49 #endif
  50 #define assert(param)
  51
  52 __constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];
  53 #ifdef HLTCA_GPU_TEXTURE_FETCH
  54 texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;
  55 texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;
  56 texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
  57 #endif
  58
  59 //Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code
  60 #include "AliHLTTPCCATrackParam.cxx"
  61 #include "AliHLTTPCCATrack.cxx"
  62
  63 #include "AliHLTTPCCAHitArea.cxx"
  64 #include "AliHLTTPCCAGrid.cxx"
  65 #include "AliHLTTPCCARow.cxx"
  66 #include "AliHLTTPCCAParam.cxx"
  67 #include "AliHLTTPCCATracker.cxx"
  68
  69 #include "AliHLTTPCCAProcess.h"
  70
  71 #include "AliHLTTPCCATrackletSelector.cxx"
  72 #include "AliHLTTPCCANeighboursFinder.cxx"
  73 #include "AliHLTTPCCANeighboursCleaner.cxx"
  74 #include "AliHLTTPCCAStartHitsFinder.cxx"
  75 #include "AliHLTTPCCAStartHitsSorter.cxx"
  76 #include "AliHLTTPCCATrackletConstructor.cxx"
  77
  78 #ifdef HLTCA_GPU_MERGER
  79 #include "AliHLTTPCGMMerger.h"
  80 #include "AliHLTTPCGMTrackParam.cxx"
  81 #endif
  82
  83 #include "MemoryAssignmentHelpers.h"
  84
  85 #ifndef HLTCA_STANDALONE
  86 #include "AliHLTDefinitions.h"
  87 #include "AliHLTSystem.h"
  88 #endif
  89
  90 #define RANDOM_ERROR
  91 //#define RANDOM_ERROR || rand() % 500 == 1
  92
  93 ClassImp( AliHLTTPCCAGPUTrackerNVCC )
  94
  95 int AliHLTTPCCAGPUTrackerNVCC::GlobalTracking(int iSlice, int threadId, AliHLTTPCCAGPUTrackerNVCC::helperParam* hParam)
  96 {
  97         if (fDebugLevel >= 3) printf("GPU Tracker running Global Tracking for slice %d on thread %d\n", iSlice, threadId);
  98
  99         int sliceLeft = (iSlice + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2);
 100         int sliceRight = (iSlice + 1) % (fgkNSlices / 2);
 101         if (iSlice >= fgkNSlices / 2)
 102         {
 103                 sliceLeft += fgkNSlices / 2;
 104                 sliceRight += fgkNSlices / 2;
 105         }
 106         while (fSliceOutputReady < iSlice || fSliceOutputReady < sliceLeft || fSliceOutputReady < sliceRight)
 107         {
 108                 if (hParam != NULL && hParam->fReset) return(1);
 109         }
 110
 111         pthread_mutex_lock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceLeft]);
 112         pthread_mutex_lock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceRight]);
 113         fSlaveTrackers[iSlice].PerformGlobalTracking(fSlaveTrackers[sliceLeft], fSlaveTrackers[sliceRight], HLTCA_GPU_MAX_TRACKS);
 114         pthread_mutex_unlock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceLeft]);
 115         pthread_mutex_unlock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceRight]);
 116
 117         fSliceLeftGlobalReady[sliceLeft] = 1;
 118         fSliceRightGlobalReady[sliceRight] = 1;
 119         if (fDebugLevel >= 3) printf("GPU Tracker finished Global Tracking for slice %d on thread %d\n", iSlice, threadId);
 120         return(0);
 121 }
 122
 123 void* AliHLTTPCCAGPUTrackerNVCC::helperWrapper(void* arg)
 124 {
 125         AliHLTTPCCAGPUTrackerNVCC::helperParam* par = (AliHLTTPCCAGPUTrackerNVCC::helperParam*) arg;
 126         AliHLTTPCCAGPUTrackerNVCC* cls = par->fCls;
 127
 128         AliHLTTPCCATracker* tmpTracker = new AliHLTTPCCATracker;
 129
 130 #ifdef HLTCA_STANDALONE
 131         if (cls->fDebugLevel >= 2) HLTInfo("\tHelper thread %d starting", par->fNum);
 132 #endif
 133
 134 #if defined(HLTCA_STANDALONE) & !defined(_WIN32)
 135         cpu_set_t mask;
 136         CPU_ZERO(&mask);
 137         CPU_SET(par->fNum * 2 + 2, &mask);
 138         //sched_setaffinity(0, sizeof(mask), &mask);
 139 #endif
 140
 141         while(pthread_mutex_lock(&((pthread_mutex_t*) par->fMutex)[0]) == 0 && par->fTerminate == false)
 142         {
 143                 if (par->CPUTracker)
 144                 {
 145                         for (int i = 0;i < cls->fNSlicesPerCPUTracker;i++)
 146                         {
 147                                 int myISlice = cls->fSliceCount - cls->fNCPUTrackers * cls->fNSlicesPerCPUTracker + (par->fNum - cls->fNHelperThreads) * cls->fNSlicesPerCPUTracker + i;
 148 #ifdef HLTCA_STANDALONE
 149                                 if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Doing full CPU tracking, Slice %d", par->fNum, myISlice);
 150 #endif
 151                                 if (myISlice >= 0)
 152                                 {
 153                                         tmpTracker->Initialize(cls->fSlaveTrackers[par->fFirstSlice + myISlice].Param());
 154                                         tmpTracker->ReadEvent(&par->pClusterData[myISlice]);
 155                                         tmpTracker->DoTracking();
 156                                         tmpTracker->SetOutput(&par->pOutput[myISlice]);
 157                                         pthread_mutex_lock((pthread_mutex_t*) cls->fHelperMemMutex);
 158                                         tmpTracker->WriteOutputPrepare();
 159                                         pthread_mutex_unlock((pthread_mutex_t*) cls->fHelperMemMutex);
 160                                         tmpTracker->WriteOutput();
 161
 162                                         /*cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUSliceDataMemory((char*) new uint4[HLTCA_GPU_SLICE_DATA_MEMORY/sizeof(uint4)], (char*) new uint4[HLTCA_GPU_ROWS_MEMORY/sizeof(uint4)]);
 163                                         cls->fSlaveTrackers[par->fFirstSlice + myISlice].ReadEvent(&par->pClusterData[myISlice]);
 164                                         cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersTracklets(HLTCA_GPU_MAX_TRACKLETS);
 165                                         cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersHits(par->pClusterData[myISlice].NumberOfClusters());
 166                                         cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersTracks(HLTCA_GPU_MAX_TRACKS, par->pClusterData[myISlice].NumberOfClusters());
 167                                         cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS, cls->fConstructorBlockCount);
 168                                         cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].HitMemorySize()/sizeof( uint4 ) + 100]), par->pClusterData[myISlice].NumberOfClusters());
 169                                         cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerTracksMemory(reinterpret_cast<char*> ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackMemorySize()/sizeof( uint4 ) + 100]), HLTCA_GPU_MAX_TRACKS, par->pClusterData[myISlice].NumberOfClusters());
 170                                         cls->fSlaveTrackers[par->fFirstSlice + myISlice].DoTracking();
 171                                         cls->WriteOutput(par->pOutput, par->fFirstSlice, myISlice, par->fNum + 1);
 172                                         delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].HitMemory();
 173                                         delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackletMemory();
 174                                         delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackMemory();*/
 175                                 }
 176 #ifdef HLTCA_STANDALONE
 177                                 if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Finished, Slice %d", par->fNum, myISlice);
 178 #endif
 179                         }
 180                 }
 181                 else
 182                 {
 183                         int mustRunSlice19 = 0;
 184                         for (int i = par->fNum + 1;i < par->fSliceCount;i += cls->fNHelperThreads + 1)
 185                         {
 186                                 //if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Running, Slice %d+%d, Phase %d", par->fNum, par->fFirstSlice, i, par->fPhase);
 187                                 if (par->fPhase)
 188                                 {
 189                                         if (cls->fUseGlobalTracking)
 190                                         {
 191                                                 int realSlice = i + 1;
 192                                                 if (realSlice % (fgkNSlices / 2) < 1) realSlice -= fgkNSlices / 2;
 193
 194                                                 if (realSlice % (fgkNSlices / 2) != 1)
 195                                                 {
 196                                                         cls->GlobalTracking(realSlice, par->fNum + 1, par);
 197                                                 }
 198
 199                                                 if (realSlice == 19)
 200                                                 {
 201                                                         mustRunSlice19 = 1;
 202                                                 }
 203                                                 else
 204                                                 {
 205                                                         while (cls->fSliceLeftGlobalReady[realSlice] == 0 || cls->fSliceRightGlobalReady[realSlice] == 0)
 206                                                         {
 207                                                                 if (par->fReset) goto ResetHelperThread;
 208                                                         }
 209                                                         cls->WriteOutput(par->pOutput, par->fFirstSlice, realSlice, par->fNum + 1);
 210                                                 }
 211                                         }
 212                                         else
 213                                         {
 214                                                 while (cls->fSliceOutputReady < i)
 215                                                 {
 216                                                         if (par->fReset) goto ResetHelperThread;
 217                                                 }
 218                                                 cls->WriteOutput(par->pOutput, par->fFirstSlice, i, par->fNum + 1);
 219                                         }
 220                                 }
 221                                 else
 222                                 {
 223                                         cls->ReadEvent(par->pClusterData, par->fFirstSlice, i, par->fNum + 1);
 224                                         par->fDone = i + 1;
 225                                 }
 226                                 //if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Finished, Slice %d+%d, Phase %d", par->fNum, par->fFirstSlice, i, par->fPhase);
 227                         }
 228                         if (mustRunSlice19)
 229                         {
 230                                 while (cls->fSliceLeftGlobalReady[19] == 0 || cls->fSliceRightGlobalReady[19] == 0)
 231                                 {
 232                                         if (par->fReset) goto ResetHelperThread;
 233                                 }
 234                                 cls->WriteOutput(par->pOutput, par->fFirstSlice, 19, par->fNum + 1);
 235                         }
 236                 }
 237 ResetHelperThread:
 238                 cls->ResetThisHelperThread(par);
 239         }
 240 #ifdef HLTCA_STANDALONE
 241         if (cls->fDebugLevel >= 2) HLTInfo("\tHelper thread %d terminating", par->fNum);
 242 #endif
 243         delete tmpTracker;
 244         pthread_mutex_unlock(&((pthread_mutex_t*) par->fMutex)[1]);
 245         pthread_exit(NULL);
 246         return(NULL);
 247 }
 248
 249 void AliHLTTPCCAGPUTrackerNVCC::ResetThisHelperThread(AliHLTTPCCAGPUTrackerNVCC::helperParam* par)
 250 {
 251         if (par->fReset) HLTImportant("GPU Helper Thread %d reseting", par->fNum);
 252         par->fReset = false;
 253         pthread_mutex_unlock(&((pthread_mutex_t*) par->fMutex)[1]);
 254 }
 255
 256 #define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem"
 257
 258 AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() :
 259 fGpuTracker(NULL),
 260 fGPUMemory(NULL),
 261 fHostLockedMemory(NULL),
 262 fGPUMergerMemory(NULL),
 263 fGPUMergerHostMemory(NULL),
 264 fGPUMergerMaxMemory(0),
 265 fDebugLevel(0),
 266 fDebugMask(0xFFFFFFFF),
 267 fOutFile(NULL),
 268 fGPUMemSize(0),
 269 fpCudaStreams(NULL),
 270 fSliceCount(HLTCA_GPU_DEFAULT_MAX_SLICE_COUNT),
 271 fCudaDevice(0),
 272 fOutputControl(NULL),
 273 fThreadId(0),
 274 fCudaInitialized(0),
 275 fPPMode(0),
 276 fSelfheal(0),
 277 fConstructorBlockCount(30),
 278 selectorBlockCount(30),
 279 fCudaContext(NULL),
 280 fNHelperThreads(HLTCA_GPU_DEFAULT_HELPER_THREADS),
 281 fHelperParams(NULL),
 282 fHelperMemMutex(NULL),
 283 fSliceOutputReady(0),
 284 fSliceGlobalMutexes(NULL),
 285 fNCPUTrackers(0),
 286 fNSlicesPerCPUTracker(0),
 287 fGlobalTracking(0),
 288 fUseGlobalTracking(0),
 289 fNSlaveThreads(0)
 290 {
 291         fCudaContext = (void*) new CUcontext;
 292 };
 293
 294 AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC()
 295 {
 296         delete (CUcontext*) fCudaContext;
 297 };
 298
 299 void AliHLTTPCCAGPUTrackerNVCC::ReleaseGlobalLock(void* sem)
 300 {
 301         //Release the global named semaphore that locks GPU Initialization
 302 #ifdef R__WIN32
 303         HANDLE* h = (HANDLE*) sem;
 304         ReleaseSemaphore(*h, 1, NULL);
 305         CloseHandle(*h);
 306         delete h;
 307 #else
 308         sem_t* pSem = (sem_t*) sem;
 309         sem_post(pSem);
 310         sem_unlink(SemLockName);
 311 #endif
 312 }
 313
 314 int AliHLTTPCCAGPUTrackerNVCC::CheckMemorySizes(int sliceCount)
 315 {
 316         //Check constants for correct memory sizes
 317         if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
 318         {
 319                 HLTError("Insufficiant Tracker Object Memory for %d slices", sliceCount);
 320                 return(1);
 321         }
 322
 323         if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
 324         {
 325                 HLTError("Insufficiant Common Memory");
 326                 return(1);
 327         }
 328
 329         if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
 330         {
 331                 HLTError("Insufficiant Row Memory");
 332                 return(1);
 333         }
 334
 335         if (fDebugLevel >= 3)
 336         {
 337                 HLTInfo("Memory usage: Tracker Object %d / %d, Common Memory %d / %d, Row Memory %d / %d", (int) sizeof(AliHLTTPCCATracker) * sliceCount, HLTCA_GPU_TRACKER_OBJECT_MEMORY, (int) (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize()), HLTCA_GPU_COMMON_MEMORY, (int) (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow)), HLTCA_GPU_ROWS_MEMORY);
 338         }
 339         return(0);
 340 }
 341
 342 int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)
 343 {
 344         //Find best CUDA device, initialize and allocate memory
 345
 346 #if defined(HLTCA_STANDALONE) & !defined(_WIN32)
 347         cpu_set_t mask;
 348         CPU_ZERO(&mask);
 349         CPU_SET(0, &mask);
 350         //sched_setaffinity(0, sizeof(mask), &mask);
 351 #endif
 352
 353         if (sliceCount == -1) sliceCount = fSliceCount;
 354
 355         if (CheckMemorySizes(sliceCount)) return(1);
 356
 357 #ifdef R__WIN32
 358         HANDLE* semLock = new HANDLE;
 359         *semLock = CreateSemaphore(NULL, 1, 1, SemLockName);
 360         if (*semLock == NULL)
 361         {
 362                 HLTError("Error creating GPUInit Semaphore");
 363                 return(1);
 364         }
 365         WaitForSingleObject(*semLock, INFINITE);
 366 #else
 367         sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1);
 368         if (semLock == SEM_FAILED)
 369         {
 370                 HLTError("Error creating GPUInit Semaphore");
 371                 return(1);
 372         }
 373         timespec semtime;
 374         clock_gettime(CLOCK_REALTIME, &semtime);
 375         semtime.tv_sec += 10;
 376         while (sem_timedwait(semLock, &semtime) != 0)
 377         {
 378                 HLTError("Global Lock for GPU initialisation was not released for 10 seconds, assuming another thread died");
 379                 HLTWarning("Resetting the global lock");
 380                 sem_post(semLock);
 381         }
 382 #endif
 383
 384         fThreadId = GetThread();
 385
 386         cudaDeviceProp fCudaDeviceProp;
 387
 388         fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
 389
 390 #ifdef HLTCA_GPU_MERGER
 391         fGPUMergerMaxMemory = 2000000 * 5 * sizeof(float);
 392         fGPUMemSize += fGPUMergerMaxMemory;
 393 #endif
 394
 395 #ifndef CUDA_DEVICE_EMULATION
 396         int count, bestDevice = -1;
 397         long long int bestDeviceSpeed = 0, deviceSpeed;
 398         if (CudaFailedMsg(cudaGetDeviceCount(&count)))
 399         {
 400                 HLTError("Error getting CUDA Device Count");
 401                 ReleaseGlobalLock(semLock);
 402                 return(1);
 403         }
 404         if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");
 405 #ifdef FERMI
 406         const int reqVerMaj = 2;
 407         const int reqVerMin = 0;
 408 #else
 409         const int reqVerMaj = 1;
 410         const int reqVerMin = 2;
 411 #endif
 412         for (int i = 0;i < count;i++)
 413         {
 414                 if (fDebugLevel >= 4) printf("Examining device %d\n", i);
 415 #if CUDA_VERSION > 3010
 416                 size_t free, total;
 417 #else
 418                 unsigned int free, total;
 419 #endif
 420                 cuInit(0);
 421                 CUdevice tmpDevice;
 422                 cuDeviceGet(&tmpDevice, i);
 423                 CUcontext tmpContext;
 424                 cuCtxCreate(&tmpContext, 0, tmpDevice);
 425                 if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";
 426                 cuCtxDestroy(tmpContext);
 427                 if (fDebugLevel >= 4) printf("Obtained current memory usage for device %d\n", i);
 428                 if (CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i))) continue;
 429                 if (fDebugLevel >= 4) printf("Obtained device properties for device %d\n", i);
 430                 int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < reqVerMaj || (fCudaDeviceProp.major == reqVerMaj && fCudaDeviceProp.minor < reqVerMin)) && free >= fGPUMemSize + 100 * 1024 + 1024;
 431 #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
 432                 //if (sliceCount > fCudaDeviceProp.multiProcessorCount * HLTCA_GPU_BLOCK_COUNT_CONSTRUCTOR_MULTIPLIER) deviceOK = 0;
 433 #endif
 434
 435                 if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %lld / %lld)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, (long long int) free, (long long int) fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");
 436                 deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free * (long long int) fCudaDeviceProp.major * (long long int) fCudaDeviceProp.major;
 437                 if (deviceOK && deviceSpeed > bestDeviceSpeed)
 438                 {
 439                         bestDevice = i;
 440                         bestDeviceSpeed = deviceSpeed;
 441                 }
 442         }
 443         if (bestDevice == -1)
 444         {
 445                 HLTWarning("No %sCUDA Device available, aborting CUDA Initialisation", count ? "appropriate " : "");
 446                 HLTInfo("Requiring Revision %d.%d, Mem: %lld, Multiprocessors: %d", reqVerMaj, reqVerMin, fGPUMemSize + 100 * 1024 * 1024, sliceCount);
 447                 ReleaseGlobalLock(semLock);
 448                 return(1);
 449         }
 450
 451         if (forceDeviceID == -1)
 452                 fCudaDevice = bestDevice;
 453         else
 454                 fCudaDevice = forceDeviceID;
 455 #else
 456         fCudaDevice = 0;
 457 #endif
 458
 459         cudaGetDeviceProperties(&fCudaDeviceProp ,fCudaDevice );
 460
 461         if (fDebugLevel >= 1)
 462         {
 463                 HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);
 464                 HLTInfo("totalGlobalMem = %lld", (unsigned long long int) fCudaDeviceProp.totalGlobalMem);
 465                 HLTInfo("sharedMemPerBlock = %lld", (unsigned long long int) fCudaDeviceProp.sharedMemPerBlock);
 466                 HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);
 467                 HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);
 468                 HLTInfo("memPitch = %lld", (unsigned long long int) fCudaDeviceProp.memPitch);
 469                 HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);
 470                 HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);
 471                 HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);
 472                 HLTInfo("totalConstMem = %lld", (unsigned long long int) fCudaDeviceProp.totalConstMem);
 473                 HLTInfo("major = %d", fCudaDeviceProp.major);
 474                 HLTInfo("minor = %d", fCudaDeviceProp.minor);
 475                 HLTInfo("clockRate = %d", fCudaDeviceProp.clockRate);
 476                 HLTInfo("memoryClockRate = %d", fCudaDeviceProp.memoryClockRate);
 477                 HLTInfo("multiProcessorCount = %d", fCudaDeviceProp.multiProcessorCount);
 478                 HLTInfo("textureAlignment = %lld", (unsigned long long int) fCudaDeviceProp.textureAlignment);
 479         }
 480         fConstructorBlockCount = fCudaDeviceProp.multiProcessorCount * HLTCA_GPU_BLOCK_COUNT_CONSTRUCTOR_MULTIPLIER;
 481         selectorBlockCount = fCudaDeviceProp.multiProcessorCount * HLTCA_GPU_BLOCK_COUNT_SELECTOR_MULTIPLIER;
 482
 483         if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
 484         {
 485                 HLTError( "Unsupported CUDA Device" );
 486                 ReleaseGlobalLock(semLock);
 487                 return(1);
 488         }
 489
 490         if (cuCtxCreate((CUcontext*) fCudaContext, CU_CTX_SCHED_AUTO, fCudaDevice) != CUDA_SUCCESS)
 491         {
 492                 HLTError("Could not set CUDA Device!");
 493                 ReleaseGlobalLock(semLock);
 494                 return(1);
 495         }
 496
 497         if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
 498         {
 499                 HLTError("CUDA Memory Allocation Error");
 500                 cudaThreadExit();
 501                 ReleaseGlobalLock(semLock);
 502                 return(1);
 503         }
 504         fGPUMergerMemory = ((char*) fGPUMemory) + fGPUMemSize - fGPUMergerMaxMemory;
 505         ReleaseGlobalLock(semLock);
 506         if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
 507         int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
 508 #ifdef HLTCA_GPU_MERGER
 509         hostMemSize += fGPUMergerMaxMemory;
 510 #endif
 511         if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
 512         {
 513                 cudaFree(fGPUMemory);
 514                 cudaThreadExit();
 515                 HLTError("Error allocating Page Locked Host Memory");
 516                 return(1);
 517         }
 518         if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);
 519         fGPUMergerHostMemory = ((char*) fHostLockedMemory) + hostMemSize - fGPUMergerMaxMemory;
 520
 521         if (fDebugLevel >= 1)
 522         {
 523                 CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
 524         }
 525
 526         fSliceCount = sliceCount;
 527         //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
 528         fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);
 529
 530         for (int i = 0;i < fgkNSlices;i++)
 531         {
 532                 fSlaveTrackers[i].SetGPUTracker();
 533                 fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
 534                 fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
 535         }
 536
 537         fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
 538         cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
 539         for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
 540         {
 541                 if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
 542                 {
 543                         cudaFree(fGPUMemory);
 544                         cudaFreeHost(fHostLockedMemory);
 545                         cudaThreadExit();
 546                         HLTError("Error creating CUDA Stream");
 547                         return(1);
 548                 }
 549         }
 550
 551         if (StartHelperThreads()) return(1);
 552
 553         fHelperMemMutex = malloc(sizeof(pthread_mutex_t));
 554         if (fHelperMemMutex == NULL)
 555         {
 556                 HLTError("Memory allocation error");
 557                 cudaFree(fGPUMemory);
 558                 cudaFreeHost(fHostLockedMemory);
 559                 cudaThreadExit();
 560                 return(1);
 561         }
 562
 563         if (pthread_mutex_init((pthread_mutex_t*) fHelperMemMutex, NULL))
 564         {
 565                 HLTError("Error creating pthread mutex");
 566                 cudaFree(fGPUMemory);
 567                 cudaFreeHost(fHostLockedMemory);
 568                 cudaThreadExit();
 569                 return(1);
 570         }
 571
 572         fSliceGlobalMutexes = malloc(sizeof(pthread_mutex_t) * fgkNSlices);
 573         if (fSliceGlobalMutexes == NULL)
 574         {
 575                 HLTError("Memory allocation error");
 576                 cudaFree(fGPUMemory);
 577                 cudaFreeHost(fHostLockedMemory);
 578                 cudaThreadExit();
 579                 return(1);
 580         }
 581         for (int i = 0;i < fgkNSlices;i++)
 582         {
 583                 if (pthread_mutex_init(&((pthread_mutex_t*) fSliceGlobalMutexes)[i], NULL))
 584                 {
 585                         HLTError("Error creating pthread mutex");
 586                         cudaFree(fGPUMemory);
 587                         cudaFreeHost(fHostLockedMemory);
 588                         cudaThreadExit();
 589                         return(1);
 590                 }
 591         }
 592
 593         cuCtxPopCurrent((CUcontext*) fCudaContext);
 594         fCudaInitialized = 1;
 595         HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %d, Max slices: %d)", fCudaDevice, fCudaDeviceProp.name, fThreadId, fSliceCount);
 596
 597 #if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)
 598         if (fDebugLevel < 2 && 0)
 599         {
 600                 //Do one initial run for Benchmark reasons
 601                 const int useDebugLevel = fDebugLevel;
 602                 fDebugLevel = 0;
 603                 AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];
 604
 605                 std::ifstream fin;
 606
 607                 AliHLTTPCCAParam tmpParam;
 608                 AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;
 609
 610                 fin.open("events/settings.dump");
 611                 int tmpCount;
 612                 fin >> tmpCount;
 613                 for (int i = 0;i < sliceCount;i++)
 614                 {
 615                         fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);
 616                         tmpParam.ReadSettings(fin);
 617                         InitializeSliceParam(i, tmpParam);
 618                 }
 619                 fin.close();
 620
 621                 fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);
 622                 for (int i = 0;i < sliceCount;i++)
 623                 {
 624                         tmpCluster[i].StartReading(i, 0);
 625                         tmpCluster[i].ReadEvent(fin);
 626                 }
 627                 fin.close();
 628
 629                 AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];
 630                 memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));
 631
 632                 Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);
 633                 for (int i = 0;i < sliceCount;i++)
 634                 {
 635                         free(tmpOutput[i]);
 636                         tmpOutput[i] = NULL;
 637                         fSlaveTrackers[i].SetOutputControl(NULL);
 638                 }
 639                 delete[] tmpOutput;
 640                 delete[] tmpCluster;
 641                 fDebugLevel = useDebugLevel;
 642         }
 643 #endif
 644
 645         return(0);
 646 }
 647
 648 int AliHLTTPCCAGPUTrackerNVCC::StartHelperThreads()
 649 {
 650         int nThreads = fNHelperThreads + fNCPUTrackers;
 651         if (nThreads)
 652         {
 653                 fHelperParams = new helperParam[nThreads];
 654                 if (fHelperParams == NULL)
 655                 {
 656                         HLTError("Memory allocation error");
 657                         cudaFree(fGPUMemory);
 658                         cudaFreeHost(fHostLockedMemory);
 659                         cudaThreadExit();
 660                         return(1);
 661                 }
 662                 for (int i = 0;i < nThreads;i++)
 663                 {
 664                         fHelperParams[i].fCls = this;
 665                         fHelperParams[i].fTerminate = false;
 666                         fHelperParams[i].fReset = false;
 667                         fHelperParams[i].fNum = i;
 668                         fHelperParams[i].fMutex = malloc(2 * sizeof(pthread_mutex_t));
 669                         if (fHelperParams[i].fMutex == NULL)
 670                         {
 671                                 HLTError("Memory allocation error");
 672                                 cudaFree(fGPUMemory);
 673                                 cudaFreeHost(fHostLockedMemory);
 674                                 cudaThreadExit();
 675                                 return(1);
 676                         }
 677                         for (int j = 0;j < 2;j++)
 678                         {
 679                                 if (pthread_mutex_init(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j], NULL))
 680                                 {
 681                                         HLTError("Error creating pthread mutex");
 682                                         cudaFree(fGPUMemory);
 683                                         cudaFreeHost(fHostLockedMemory);
 684                                         cudaThreadExit();
 685                                         return(1);
 686                                 }
 687
 688                                 pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]);
 689                         }
 690                         fHelperParams[i].fThreadId = (void*) malloc(sizeof(pthread_t));
 691
 692                         if (pthread_create((pthread_t*) fHelperParams[i].fThreadId, NULL, helperWrapper, &fHelperParams[i]))
 693                         {
 694                                 HLTError("Error starting slave thread");
 695                                 cudaFree(fGPUMemory);
 696                                 cudaFreeHost(fHostLockedMemory);
 697                                 cudaThreadExit();
 698                         }
 699                 }
 700         }
 701         fNSlaveThreads = nThreads;
 702         return(0);
 703 }
 704
 705 template <class T> inline T* AliHLTTPCCAGPUTrackerNVCC::alignPointer(T* ptr, int alignment)
 706 {
 707         //Macro to align Pointers.
 708         //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker
 709         //(As long as every single data structure is <= 1 MB)
 710
 711         size_t adr = (size_t) ptr;
 712         if (adr % alignment)
 713         {
 714                 adr += alignment - (adr % alignment);
 715         }
 716         return((T*) adr);
 717 }
 718
 719 bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsgA(cudaError_t error, const char* file, int line)
 720 {
 721         //Check for CUDA Error and in the case of an error display the corresponding error string
 722         if (error == cudaSuccess) return(false);
 723         HLTWarning("CUDA Error: %d / %s (%s:%d)", error, cudaGetErrorString(error), file, line);
 724         return(true);
 725 }
 726
 727 int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state, int sliceLocal, int slice)
 728 {
 729         //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards
 730
 731         if (fDebugLevel == 0) return(0);
 732         cudaError cuErr;
 733         cuErr = cudaGetLastError();
 734         if (cuErr != cudaSuccess)
 735         {
 736                 HLTError("Cuda Error %s while running kernel (%s) (Slice %d; %d/%d)", cudaGetErrorString(cuErr), state, sliceLocal, slice, fgkNSlices);
 737                 return(1);
 738         }
 739         if (CudaFailedMsg(cudaThreadSynchronize()))
 740         {
 741                 HLTError("CUDA Error while synchronizing (%s) (Slice %d; %d/%d)", state, sliceLocal, slice, fgkNSlices);
 742                 return(1);
 743         }
 744         if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
 745         return(0);
 746 }
 747
 748 void AliHLTTPCCAGPUTrackerNVCC::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
 749 {
 750         //Set Debug Level and Debug output File if applicable
 751         fDebugLevel = dwLevel;
 752         if (NewOutFile) fOutFile = NewOutFile;
 753 }
 754
 755 int AliHLTTPCCAGPUTrackerNVCC::SetGPUTrackerOption(char* OptionName, int OptionValue)
 756 {
 757         //Set a specific GPU Tracker Option
 758         if (strcmp(OptionName, "PPMode") == 0)
 759         {
 760                 fPPMode = OptionValue;
 761         }
 762         else if (strcmp(OptionName, "DebugMask") == 0)
 763         {
 764                 fDebugMask = OptionValue;
 765         }
 766         else if (strcmp(OptionName, "HelperThreads") == 0)
 767         {
 768                 fNHelperThreads = OptionValue;
 769         }
 770         else if (strcmp(OptionName, "CPUTrackers") == 0)
 771         {
 772                 fNCPUTrackers = OptionValue;
 773         }
 774         else if (strcmp(OptionName, "SlicesPerCPUTracker") == 0)
 775         {
 776                 fNSlicesPerCPUTracker = OptionValue;
 777         }
 778         else if (strcmp(OptionName, "GlobalTracking") == 0)
 779         {
 780                 fGlobalTracking = OptionValue;
 781         }
 782         else
 783         {
 784                 HLTError("Unknown Option: %s", OptionName);
 785                 return(1);
 786         }
 787
 788         if (fNHelperThreads + fNCPUTrackers > fNSlaveThreads && fCudaInitialized)
 789         {
 790                 HLTInfo("Insufficient Slave Threads available (%d), creating additional Slave Threads (%d+%d)\n", fNSlaveThreads, fNHelperThreads, fNCPUTrackers);
 791                 StopHelperThreads();
 792                 StartHelperThreads();
 793         }
 794
 795         return(0);
 796 }
 797
 798 #ifdef HLTCA_STANDALONE
 799 void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int iSlice, int i)
 800 {
 801         //Run Performance Query for timer i of slice iSlice
 802         if (fDebugLevel >= 1)
 803         {
 804                 AliHLTTPCCATracker::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));
 805         }
 806 }
 807 #else
 808 void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
 809 #endif
 810
 811 #if defined(BITWISE_COMPATIBLE_DEBUG_OUTPUT) || defined(HLTCA_GPU_ALTERNATIVE_SCHEDULER)
 812 void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker*, int, bool) {}
 813 #else
 814 void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
 815 {
 816         //Dump Rowblocks to File
 817         if (fDebugLevel >= 4)
 818         {
 819                 *fOutFile << "RowBlock Tracklets (Slice" << tracker[iSlice].Param().ISlice() << " (" << iSlice << " of reco))";
 820                 *fOutFile << " after Tracklet Reconstruction";
 821                 *fOutFile << std::endl;
 822
 823                 int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);
 824                 int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);
 825                 uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * fConstructorBlockCount);
 826                 CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
 827                 CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
 828                 CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * fConstructorBlockCount, cudaMemcpyDeviceToHost));
 829                 CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
 830
 831                 int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
 832                 for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
 833                 {
 834                         *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " <<
 835                                 rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";
 836                         for (int j = 0;j < rowBlockPos[i].x;j++)
 837                         {
 838                                 //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
 839                                 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
 840 #ifdef HLTCA_GPU_SCHED_FIXED_START
 841                                 if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)
 842                                 {
 843                                         HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);
 844                                 }
 845 #endif //HLTCA_GPU_SCHED_FIXED_START
 846                                 k++;
 847                                 if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)
 848                                 {
 849                                         HLTError("Error, -1 Tracklet found");
 850                                 }
 851                         }
 852                         *fOutFile << std::endl << "Phase 2: ";
 853                         for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
 854                         {
 855                                 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
 856                         }
 857                         *fOutFile << std::endl;
 858                 }
 859
 860                 if (check)
 861                 {
 862                         *fOutFile << "Starting Threads: (Slice" << tracker[iSlice].Param().ISlice() << ", First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;
 863                         for (int i = 0;i < fConstructorBlockCount;i++)
 864                         {
 865                                 *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;
 866                         }
 867                 }
 868
 869                 free(rowBlockPos);
 870                 free(rowBlockTracklets);
 871                 free(blockStartingTracklet);
 872         }
 873 }
 874 #endif
 875
 876 __global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)
 877 {
 878         //Initialize GPU RowBlocks and HitWeights
 879         int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;
 880         const int stride = blockDim.x * gridDim.x;
 881         int4 i0;
 882         i0.x = i0.y = i0.z = i0.w = 0;
 883 #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
 884         int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;
 885         int4 i1;
 886         i1.x = i1.y = i1.z = i1.w = -1;
 887         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)
 888                 RowBlockPos[i] = i0;
 889         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)
 890                 rowBlockTracklets4[i] = i1;
 891 #endif
 892         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)
 893                 sliceDataHitWeights4[i] = i0;
 894 }
 895
 896 int AliHLTTPCCAGPUTrackerNVCC::SelfHealReconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
 897 {
 898         if (!fSelfheal)
 899         {
 900                 cuCtxPopCurrent((CUcontext*) fCudaContext);
 901                 return(1);
 902         }
 903         static bool selfHealing = false;
 904         if (selfHealing)
 905         {
 906                 HLTError("Selfhealing failed, giving up");
 907                 cuCtxPopCurrent((CUcontext*) fCudaContext);
 908                 return(1);
 909         }
 910         else
 911         {
 912                 HLTError("Unsolvable CUDA error occured, trying to reinitialize GPU");
 913         }
 914         selfHealing = true;
 915         ExitGPU();
 916         if (InitGPU(fSliceCount, fCudaDevice))
 917         {
 918                 HLTError("Could not reinitialize CUDA device, disabling GPU tracker");
 919                 ExitGPU();
 920                 return(1);
 921         }
 922         HLTInfo("GPU tracker successfully reinitialized, restarting tracking");
 923         int retVal = Reconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal);
 924         selfHealing = false;
 925         return(retVal);
 926 }
 927
 928 void AliHLTTPCCAGPUTrackerNVCC::ReadEvent(AliHLTTPCCAClusterData* pClusterData, int firstSlice, int iSlice, int threadId)
 929 {
 930         fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
 931 #ifdef HLTCA_GPU_TIME_PROFILE
 932         unsigned long long int a, b;
 933         AliHLTTPCCATracker::StandaloneQueryTime(&a);
 934 #endif
 935         fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
 936 #ifdef HLTCA_GPU_TIME_PROFILE
 937         AliHLTTPCCATracker::StandaloneQueryTime(&b);
 938         printf("Read %d %f %f\n", threadId, ((double) b - (double) a) / (double) fProfTimeC, ((double) a - (double) fProfTimeD) / (double) fProfTimeC);
 939 #endif
 940 }
 941
 942 void AliHLTTPCCAGPUTrackerNVCC::WriteOutput(AliHLTTPCCASliceOutput** pOutput, int firstSlice, int iSlice, int threadId)
 943 {
 944         if (fDebugLevel >= 3) printf("GPU Tracker running WriteOutput for slice %d on thread %d\n", firstSlice + iSlice, threadId);
 945         fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
 946 #ifdef HLTCA_GPU_TIME_PROFILE
 947         unsigned long long int a, b;
 948         AliHLTTPCCATracker::StandaloneQueryTime(&a);
 949 #endif
 950         if (fNHelperThreads) pthread_mutex_lock((pthread_mutex_t*) fHelperMemMutex);
 951         fSlaveTrackers[firstSlice + iSlice].WriteOutputPrepare();
 952         if (fNHelperThreads) pthread_mutex_unlock((pthread_mutex_t*) fHelperMemMutex);
 953         fSlaveTrackers[firstSlice + iSlice].WriteOutput();
 954 #ifdef HLTCA_GPU_TIME_PROFILE
 955         AliHLTTPCCATracker::StandaloneQueryTime(&b);
 956         printf("Write %d %f %f\n", threadId, ((double) b - (double) a) / (double) fProfTimeC, ((double) a - (double) fProfTimeD) / (double) fProfTimeC);
 957 #endif
 958         if (fDebugLevel >= 3) printf("GPU Tracker finished WriteOutput for slice %d on thread %d\n", firstSlice + iSlice, threadId);
 959 }
 960
 961 int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
 962 {
 963         //Primary reconstruction function
 964
 965         cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
 966
 967         if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;
 968
 969         if (!fCudaInitialized)
 970         {
 971                 HLTError("GPUTracker not initialized");
 972                 return(1);
 973         }
 974         if (sliceCountLocal > fSliceCount)
 975         {
 976                 HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);
 977                 return(1);
 978         }
 979         if (fThreadId != GetThread())
 980         {
 981                 HLTWarning("CUDA thread changed, migrating context, Previous Thread: %d, New Thread: %d", fThreadId, GetThread());
 982                 fThreadId = GetThread();
 983         }
 984
 985         if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);
 986
 987         if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
 988         {
 989                 HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));
 990                 return(1);
 991         }
 992
 993         cuCtxPushCurrent(*((CUcontext*) fCudaContext));
 994         if (fPPMode)
 995         {
 996                 int retVal = ReconstructPP(pOutput, pClusterData, firstSlice, sliceCountLocal);
 997                 cuCtxPopCurrent((CUcontext*) fCudaContext);
 998                 return(retVal);
 999         }
1000
1001         for (int i = fNHelperThreads;i < fNCPUTrackers + fNHelperThreads;i++)
1002         {
1003                 fHelperParams[i].CPUTracker = 1;
1004                 fHelperParams[i].pClusterData = pClusterData;
1005                 fHelperParams[i].pOutput = pOutput;
1006                 fHelperParams[i].fSliceCount = sliceCountLocal;
1007                 fHelperParams[i].fFirstSlice = firstSlice;
1008                 pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]);
1009         }
1010         sliceCountLocal -= fNCPUTrackers * fNSlicesPerCPUTracker;
1011         if (sliceCountLocal < 0) sliceCountLocal = 0;
1012
1013         fUseGlobalTracking = fGlobalTracking && sliceCountLocal == fgkNSlices;
1014
1015         memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
1016
1017         if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
1018
1019 #ifdef HLTCA_GPU_TIME_PROFILE
1020         AliHLTTPCCATracker::StandaloneQueryFreq(&fProfTimeC);
1021         AliHLTTPCCATracker::StandaloneQueryTime(&fProfTimeD);
1022 #endif
1023
1024         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1025         {
1026                 //Make this a GPU Tracker
1027                 fGpuTracker[iSlice].SetGPUTracker();
1028                 fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
1029                 fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
1030                 fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
1031
1032                 //Set Pointers to GPU Memory
1033                 char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
1034
1035                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
1036                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
1037                 tmpMem = alignPointer(tmpMem, 1024 * 1024);
1038
1039                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
1040                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount);
1041                 tmpMem = alignPointer(tmpMem, 1024 * 1024);
1042
1043                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
1044                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
1045                 tmpMem = alignPointer(tmpMem, 1024 * 1024);
1046
1047                 if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY RANDOM_ERROR)
1048                 {
1049                         HLTError("Insufficiant Track Memory");
1050                         cudaThreadSynchronize();
1051                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1052                         ResetHelperThreads(0);
1053                         return(1);
1054                 }
1055
1056                 if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY RANDOM_ERROR)
1057                 {
1058                         HLTError("Insufficiant Global Memory");
1059                         cudaThreadSynchronize();
1060                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1061                         ResetHelperThreads(0);
1062                         return(1);
1063                 }
1064
1065                 if (fDebugLevel >= 3)
1066                 {
1067                         HLTInfo("GPU Global Memory Used: %d/%d, Page Locked Tracks Memory used: %d / %d", (int) (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice)), HLTCA_GPU_GLOBAL_MEMORY, (int) fGpuTracker[iSlice].TrackMemorySize(), HLTCA_GPU_TRACKS_MEMORY);
1068                 }
1069
1070                 //Initialize Startup Constants
1071                 *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
1072                 *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
1073                 *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
1074                 fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = sliceCountLocal > fConstructorBlockCount ? (iSlice < fConstructorBlockCount) : fConstructorBlockCount * (iSlice + 1) / sliceCountLocal - fConstructorBlockCount * (iSlice) / sliceCountLocal;
1075                 if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);
1076                 fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
1077                 fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
1078                 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
1079                 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fNextTracklet = (fConstructorBlockCount / sliceCountLocal + (fConstructorBlockCount % sliceCountLocal > iSlice)) * HLTCA_GPU_THREAD_COUNT;
1080                 fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
1081         }
1082
1083 #ifdef HLTCA_GPU_TEXTURE_FETCH
1084         cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
1085         size_t offset;
1086         if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
1087         {
1088                 HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
1089                 cudaThreadSynchronize();
1090                 cuCtxPopCurrent((CUcontext*) fCudaContext);
1091                 ResetHelperThreads(0);
1092                 return(1);
1093         }
1094         cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
1095         if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
1096         {
1097                 HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
1098                 cudaThreadSynchronize();
1099                 cuCtxPopCurrent((CUcontext*) fCudaContext);
1100                 ResetHelperThreads(0);
1101                 return(1);
1102         }
1103         cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
1104         if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
1105         {
1106                 HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
1107                 cudaThreadSynchronize();
1108                 cuCtxPopCurrent((CUcontext*) fCudaContext);
1109                 ResetHelperThreads(0);
1110                 return(1);
1111         }
1112 #endif
1113
1114         //Copy Tracker Object to GPU Memory
1115         if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
1116 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
1117         char* tmpMem;
1118         if (CudaFailedMsg(cudaMalloc(&tmpMem, 100000000)))
1119         {
1120                 HLTError("Error allocating CUDA profile memory");
1121                 cudaThreadSynchronize();
1122                 cuCtxPopCurrent((CUcontext*) fCudaContext);
1123                 ResetHelperThreads(0);
1124                 return(1);
1125         }
1126         fGpuTracker[0].fStageAtSync = tmpMem;
1127         CudaFailedMsg(cudaMemset(fGpuTracker[0].StageAtSync(), 0, 100000000));
1128 #endif
1129         CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));
1130         if (CUDASync("Initialization (1)", 0, firstSlice) RANDOM_ERROR)
1131         {
1132                 cudaThreadSynchronize();
1133                 cuCtxPopCurrent((CUcontext*) fCudaContext);
1134                 ResetHelperThreads(0);
1135                 return(1);
1136         }
1137
1138         for (int i = 0;i < fNHelperThreads;i++)
1139         {
1140                 fHelperParams[i].CPUTracker = 0;
1141                 fHelperParams[i].fDone = 0;
1142                 fHelperParams[i].fPhase = 0;
1143                 fHelperParams[i].pClusterData = pClusterData;
1144                 fHelperParams[i].fSliceCount = sliceCountLocal;
1145                 fHelperParams[i].fFirstSlice = firstSlice;
1146                 pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]);
1147         }
1148
1149         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1150         {
1151                 StandalonePerfTime(firstSlice + iSlice, 0);
1152
1153                 //Initialize GPU Slave Tracker
1154                 if (fDebugLevel >= 3) HLTInfo("Creating Slice Data (Slice %d)", iSlice);
1155                 if (iSlice % (fNHelperThreads + 1) == 0)
1156                 {
1157                         ReadEvent(pClusterData, firstSlice, iSlice, 0);
1158                 }
1159                 else
1160                 {
1161                         if (fDebugLevel >= 3) HLTInfo("Waiting for helper thread %d", iSlice % (fNHelperThreads + 1) - 1);
1162                         while(fHelperParams[iSlice % (fNHelperThreads + 1) - 1].fDone < iSlice);
1163                 }
1164
1165                 if (fDebugLevel >= 4)
1166                 {
1167 #ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
1168                         *fOutFile << std::endl << std::endl << "Reconstruction: " << iSlice << "/" << sliceCountLocal << " Total Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << " / " << fgkNSlices << std::endl;
1169 #endif
1170                         if (fDebugMask & 1) fSlaveTrackers[firstSlice + iSlice].DumpSliceData(*fOutFile);
1171                 }
1172
1173                 if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY RANDOM_ERROR)
1174                 {
1175                         HLTError("Insufficiant Slice Data Memory");
1176                         cudaThreadSynchronize();
1177                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1178                         ResetHelperThreads(1);
1179                         return(1);
1180                 }
1181
1182                 if (fDebugLevel >= 3)
1183                 {
1184                         HLTInfo("GPU Slice Data Memory Used: %d/%d", (int) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize(), HLTCA_GPU_SLICE_DATA_MEMORY);
1185                 }
1186
1187                 //Initialize temporary memory where needed
1188                 if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");
1189                 PreInitRowBlocks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());
1190                 if (CUDASync("Initialization (2)", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1191                 {
1192                         cudaThreadSynchronize();
1193                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1194                         ResetHelperThreads(1);
1195                         return(1);
1196                 }
1197
1198                 //Copy Data to GPU Global Memory
1199                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
1200                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
1201                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
1202
1203                 if (fDebugLevel >= 4)
1204                 {
1205                         if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
1206                         fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount);
1207                         fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() );
1208                 }
1209
1210                 if (CUDASync("Initialization (3)", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1211                 {
1212                         cudaThreadSynchronize();
1213                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1214                         ResetHelperThreads(1);
1215                         return(1);
1216                 }
1217                 StandalonePerfTime(firstSlice + iSlice, 1);
1218
1219                 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder (Slice %d/%d)", iSlice, sliceCountLocal);
1220                 AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), HLTCA_GPU_THREAD_COUNT_FINDER, 0, cudaStreams[iSlice & 1]>>>(iSlice);
1221
1222                 if (CUDASync("Neighbours finder", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1223                 {
1224                         cudaThreadSynchronize();
1225                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1226                         ResetHelperThreads(1);
1227                         return(1);
1228                 }
1229
1230                 StandalonePerfTime(firstSlice + iSlice, 2);
1231
1232                 if (fDebugLevel >= 4)
1233                 {
1234                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
1235                         if (fDebugMask & 2) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
1236                 }
1237
1238                 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner (Slice %d/%d)", iSlice, sliceCountLocal);
1239                 AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice);
1240                 if (CUDASync("Neighbours Cleaner", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1241                 {
1242                         cudaThreadSynchronize();
1243                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1244                         ResetHelperThreads(1);
1245                         return(1);
1246                 }
1247
1248                 StandalonePerfTime(firstSlice + iSlice, 3);
1249
1250                 if (fDebugLevel >= 4)
1251                 {
1252                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
1253                         if (fDebugMask & 4) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
1254                 }
1255
1256                 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder (Slice %d/%d)", iSlice, sliceCountLocal);
1257                 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice);
1258                 if (CUDASync("Start Hits Finder", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1259                 {
1260                         cudaThreadSynchronize();
1261                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1262                         ResetHelperThreads(1);
1263                         return(1);
1264                 }
1265
1266                 StandalonePerfTime(firstSlice + iSlice, 4);
1267
1268                 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter (Slice %d/%d)", iSlice, sliceCountLocal);
1269                 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice);
1270                 if (CUDASync("Start Hits Sorter", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1271                 {
1272                         cudaThreadSynchronize();
1273                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1274                         ResetHelperThreads(1);
1275                         return(1);
1276                 }
1277
1278                 StandalonePerfTime(firstSlice + iSlice, 5);
1279
1280                 if (fDebugLevel >= 2)
1281                 {
1282                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
1283                         if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d (Slice %d)", *fSlaveTrackers[firstSlice + iSlice].NTracklets(), iSlice);
1284                         if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS RANDOM_ERROR)
1285                         {
1286                                 HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant");
1287                                 cudaThreadSynchronize();
1288                                 cuCtxPopCurrent((CUcontext*) fCudaContext);
1289                                 ResetHelperThreads(1);
1290                                 return(1);
1291                         }
1292                 }
1293
1294                 if (fDebugLevel >= 4 && *fSlaveTrackers[firstSlice + iSlice].NTracklets())
1295                 {
1296 #ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
1297                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
1298                         if (fDebugMask & 8)
1299                         {
1300                                 *fOutFile << "Temporary ";
1301                                 fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
1302                         }
1303                         uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
1304                         CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
1305                         if (fDebugMask & 16)
1306                         {
1307                                 *fOutFile << "Start Hits Sort Vector:" << std::endl;
1308                                 for (int i = 1;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows() - 5;i++)
1309                                 {
1310                                         *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;
1311                                 }
1312                         }
1313                         free(tmpMemory);
1314 #endif
1315
1316                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
1317                         if (fDebugMask & 32) fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
1318                 }
1319
1320                 StandalonePerfTime(firstSlice + iSlice, 6);
1321
1322                 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
1323         }
1324
1325         for (int i = 0;i < fNHelperThreads;i++)
1326         {
1327                 pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]);
1328         }
1329
1330         StandalonePerfTime(firstSlice, 7);
1331
1332 #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
1333         int nHardCollisions = 0;
1334
1335 RestartTrackletConstructor:
1336         if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
1337         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1338         {
1339                 AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);
1340                 if (CUDASync("Tracklet Initializer", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1341                 {
1342                         cudaThreadSynchronize();
1343                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1344                         return(1);
1345                 }
1346                 if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice);
1347         }
1348 #endif
1349
1350         if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
1351         AliHLTTPCCATrackletConstructorGPU<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR>>>();
1352         if (CUDASync("Tracklet Constructor", 0, firstSlice) RANDOM_ERROR)
1353         {
1354                 cudaThreadSynchronize();
1355                 cuCtxPopCurrent((CUcontext*) fCudaContext);
1356                 return(1);
1357         }
1358
1359         StandalonePerfTime(firstSlice, 8);
1360
1361         if (fDebugLevel >= 4)
1362         {
1363                 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1364                 {
1365                         if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);
1366                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
1367                         if (fDebugLevel >= 5)
1368                         {
1369                                 HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
1370                         }
1371                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
1372                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
1373                         if (0 && fSlaveTrackers[firstSlice + iSlice].NTracklets() && fSlaveTrackers[firstSlice + iSlice].Tracklet(0).NHits() < 0)
1374                         {
1375                                 cudaThreadSynchronize();
1376                                 cuCtxPopCurrent((CUcontext*) fCudaContext);
1377                                 printf("INTERNAL ERROR\n");
1378                                 return(1);
1379                         }
1380                         if (fDebugMask & 128) fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);
1381                 }
1382         }
1383
1384         int runSlices = 0;
1385         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)
1386         {
1387                 if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;
1388                 if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);
1389                 AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<selectorBlockCount, HLTCA_GPU_THREAD_COUNT_SELECTOR, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));
1390                 if (CUDASync("Tracklet Selector", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1391                 {
1392                         cudaThreadSynchronize();
1393                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1394                         return(1);
1395                 }
1396         }
1397         StandalonePerfTime(firstSlice, 9);
1398
1399         char *tmpMemoryGlobalTracking = NULL;
1400         fSliceOutputReady = 0;
1401         if (fUseGlobalTracking)
1402         {
1403                 int tmpmemSize = sizeof(AliHLTTPCCATracklet)
1404 #ifdef EXTERN_ROW_HITS
1405                 + HLTCA_ROW_COUNT * sizeof(int)
1406 #endif
1407                 + 16;
1408                 tmpMemoryGlobalTracking = (char*) malloc(tmpmemSize * fgkNSlices);
1409                 for (int i = 0;i < fgkNSlices;i++)
1410                 {
1411                         fSliceLeftGlobalReady[i] = 0;
1412                         fSliceRightGlobalReady[i] = 0;
1413                 }
1414                 memset(fGlobalTrackingDone, 0, fgkNSlices);
1415                 memset(fWriteOutputDone, 0, fgkNSlices);
1416
1417                 for (int iSlice = 0;iSlice < fgkNSlices;iSlice++)
1418                 {
1419                         fSlaveTrackers[iSlice].SetGPUTrackerTrackletsMemory(tmpMemoryGlobalTracking + (tmpmemSize * iSlice), 1, fConstructorBlockCount);
1420                 }
1421         }
1422         for (int i = 0;i < fNHelperThreads;i++)
1423         {
1424                 fHelperParams[i].fPhase = 1;
1425                 fHelperParams[i].pOutput = pOutput;
1426                 pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]);
1427         }
1428
1429         int tmpSlice = 0, tmpSlice2 = 0;
1430         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1431         {
1432                 if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");
1433
1434                 while(tmpSlice < sliceCountLocal && (tmpSlice == iSlice || cudaStreamQuery(cudaStreams[tmpSlice]) == (cudaError_t) CUDA_SUCCESS))
1435                 {
1436                         if (CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice])) RANDOM_ERROR)
1437                         {
1438                                 ResetHelperThreads(1);
1439                                 cudaThreadSynchronize();
1440                                 return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal));
1441                         }
1442                         tmpSlice++;
1443                 }
1444
1445                 while (tmpSlice2 < tmpSlice && (tmpSlice2 == iSlice ? cudaStreamSynchronize(cudaStreams[tmpSlice2]) : cudaStreamQuery(cudaStreams[tmpSlice2])) == (cudaError_t) CUDA_SUCCESS)
1446                 {
1447                         CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].Tracks(), fGpuTracker[tmpSlice2].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + tmpSlice2].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));
1448                         CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].TrackHits(), fGpuTracker[tmpSlice2].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + tmpSlice2].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));
1449                         tmpSlice2++;
1450                 }
1451
1452                 if (CudaFailedMsg(cudaStreamSynchronize(cudaStreams[iSlice])) RANDOM_ERROR)
1453                 {
1454                         ResetHelperThreads(1);
1455                         cudaThreadSynchronize();
1456                         return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal));
1457                 }
1458
1459                 if (fDebugLevel >= 4)
1460                 {
1461                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
1462 #ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
1463                         if (fDebugMask & 256) fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
1464 #endif
1465                         if (fDebugMask & 512) fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);
1466                 }
1467
1468                 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError RANDOM_ERROR)
1469                 {
1470 #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
1471                         if ((fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION || fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_WRONG_ROW)&& nHardCollisions++ < 10)
1472                         {
1473                                 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION)
1474                                 {
1475                                         HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor (Slice %d)", firstSlice + iSlice);
1476                                 }
1477                                 else
1478                                 {
1479                                         HLTWarning("Tracklet Constructor returned invalid row (Slice %d)", firstSlice + iSlice);
1480                                 }
1481                                 if (fDebugLevel >= 4)
1482                                 {
1483                                         ResetHelperThreads(1);
1484                                         cudaThreadSynchronize();
1485                                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1486                                         return(1);
1487                                 }
1488                                 for (int i = 0;i < sliceCountLocal;i++)
1489                                 {
1490                                         cudaThreadSynchronize();
1491                                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));
1492                                         *fSlaveTrackers[firstSlice + i].NTracks() = 0;
1493                                         *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;
1494                                         fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;
1495                                         CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));
1496                                         PreInitRowBlocks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());
1497                                 }
1498                                 goto RestartTrackletConstructor;
1499                         }
1500 #endif
1501                         HLTError("GPU Tracker returned Error Code %d in slice %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError, firstSlice + iSlice);
1502                         cudaThreadSynchronize();
1503                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1504                         ResetHelperThreads(1);
1505                         return(1);
1506                 }
1507                 if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
1508
1509                 fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTracks = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracks;
1510                 fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTrackHits = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTrackHits;
1511                 if (fUseGlobalTracking) fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracklets = 1;
1512
1513                 if (fDebugLevel >= 3) HLTInfo("Data ready for slice %d, helper thread %d", iSlice, iSlice % (fNHelperThreads + 1));
1514                 fSliceOutputReady = iSlice;
1515
1516                 if (fUseGlobalTracking)
1517                 {
1518                         if (iSlice % (fgkNSlices / 2) == 2)
1519                         {
1520                                 int tmpId = iSlice % (fgkNSlices / 2) - 1;
1521                                 if (iSlice >= fgkNSlices / 2) tmpId += fgkNSlices / 2;
1522                                 GlobalTracking(tmpId, 0, NULL);
1523                                 fGlobalTrackingDone[tmpId] = 1;
1524                         }
1525                         for (int tmpSlice3a = 0;tmpSlice3a < iSlice;tmpSlice3a += fNHelperThreads + 1)
1526                         {
1527                                 int tmpSlice3 = tmpSlice3a + 1;
1528                                 if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2);
1529                                 if (tmpSlice3 >= iSlice) break;
1530
1531                                 int sliceLeft = (tmpSlice3 + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2);
1532                                 int sliceRight = (tmpSlice3 + 1) % (fgkNSlices / 2);
1533                                 if (tmpSlice3 >= fgkNSlices / 2)
1534                                 {
1535                                         sliceLeft += fgkNSlices / 2;
1536                                         sliceRight += fgkNSlices / 2;
1537                                 }
1538
1539                                 if (tmpSlice3 % (fgkNSlices / 2) != 1 && fGlobalTrackingDone[tmpSlice3] == 0 && sliceLeft < iSlice && sliceRight < iSlice)
1540                                 {
1541                                         GlobalTracking(tmpSlice3, 0, NULL);
1542                                         fGlobalTrackingDone[tmpSlice3] = 1;
1543                                 }
1544
1545                                 if (fWriteOutputDone[tmpSlice3] == 0 && fSliceLeftGlobalReady[tmpSlice3] && fSliceRightGlobalReady[tmpSlice3])
1546                                 {
1547                                         WriteOutput(pOutput, firstSlice, tmpSlice3, 0);
1548                                         fWriteOutputDone[tmpSlice3] = 1;
1549                                 }
1550                         }
1551                 }
1552                 else
1553                 {
1554                         if (iSlice % (fNHelperThreads + 1) == 0)
1555                         {
1556                                 WriteOutput(pOutput, firstSlice, iSlice, 0);
1557                         }
1558                 }
1559
1560                 if (fDebugLevel >= 4)
1561                 {
1562                         delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();
1563                         delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();
1564                 }
1565         }
1566
1567         if (fUseGlobalTracking)
1568         {
1569                 for (int tmpSlice3a = 0;tmpSlice3a < fgkNSlices;tmpSlice3a += fNHelperThreads + 1)
1570                 {
1571                         int tmpSlice3 = (tmpSlice3a + 1);
1572                         if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2);
1573                         if (fGlobalTrackingDone[tmpSlice3] == 0) GlobalTracking(tmpSlice3, 0, NULL);
1574                 }
1575                 for (int tmpSlice3a = 0;tmpSlice3a < fgkNSlices;tmpSlice3a += fNHelperThreads + 1)
1576                 {
1577                         int tmpSlice3 = (tmpSlice3a + 1);
1578                         if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2);
1579                         if (fWriteOutputDone[tmpSlice3] == 0)
1580                         {
1581                                 while (fSliceLeftGlobalReady[tmpSlice3] == 0 || fSliceRightGlobalReady[tmpSlice3] == 0);
1582                                 WriteOutput(pOutput, firstSlice, tmpSlice3, 0);
1583                         }
1584                 }
1585         }
1586
1587         for (int i = 0;i < fNHelperThreads + fNCPUTrackers;i++)
1588         {
1589                 pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]);
1590         }
1591
1592         if (fUseGlobalTracking)
1593         {
1594                 free(tmpMemoryGlobalTracking);
1595                 if (fDebugLevel >= 3)
1596                 {
1597                         for (int iSlice = 0;iSlice < fgkNSlices;iSlice++)
1598                         {
1599                                 printf("Slice %d - Tracks: Local %d Global %d - Hits: Local %d Global %d\n", iSlice, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTracks, fSlaveTrackers[iSlice].CommonMemory()->fNTracks, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTrackHits, fSlaveTrackers[iSlice].CommonMemory()->fNTrackHits);
1600                         }
1601                 }
1602         }
1603
1604         StandalonePerfTime(firstSlice, 10);
1605
1606         if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
1607
1608         /*for (int i = firstSlice;i < firstSlice + sliceCountLocal;i++)
1609         {
1610                 fSlaveTrackers[i].DumpOutput(stdout);
1611         }*/
1612
1613         /*static int runnum = 0;
1614         std::ofstream tmpOut;
1615         char buffer[1024];
1616         sprintf(buffer, "GPUtracks%d.out", runnum++);
1617         tmpOut.open(buffer);
1618         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1619         {
1620                 fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(tmpOut);
1621         }
1622         tmpOut.close();*/
1623
1624 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
1625         char* stageAtSync = (char*) malloc(100000000);
1626         CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].StageAtSync(), 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
1627         cudaFree(fGpuTracker[0].StageAtSync());
1628
1629         FILE* fp = fopen("profile.txt", "w+");
1630         FILE* fp2 = fopen("profile.bmp", "w+b");
1631         int nEmptySync = 0, fEmpty;
1632
1633         const int bmpheight = 8192;
1634         BITMAPFILEHEADER bmpFH;
1635         BITMAPINFOHEADER bmpIH;
1636         ZeroMemory(&bmpFH, sizeof(bmpFH));
1637         ZeroMemory(&bmpIH, sizeof(bmpIH));
1638
1639         bmpFH.bfType = 19778; //"BM"
1640         bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR / 32 * 33 - 1) * bmpheight ;
1641         bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);
1642
1643         bmpIH.biSize = sizeof(bmpIH);
1644         bmpIH.biWidth = fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR / 32 * 33 - 1;
1645         bmpIH.biHeight = bmpheight;
1646         bmpIH.biPlanes = 1;
1647         bmpIH.biBitCount = 32;
1648
1649         fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
1650         fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);
1651
1652         for (int i = 0;i < bmpheight * fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;i += fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR)
1653         {
1654                 fEmpty = 1;
1655                 for (int j = 0;j < fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;j++)
1656                 {
1657                         fprintf(fp, "%d\t", stageAtSync[i + j]);
1658                         int color = 0;
1659                         if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);
1660                         if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);
1661                         if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);
1662                         if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);
1663                         fwrite(&color, 1, sizeof(int), fp2);
1664                         if (j > 0 && j % 32 == 0)
1665                         {
1666                                 color = RGB(255, 255, 255);
1667                                 fwrite(&color, 1, 4, fp2);
1668                         }
1669                         if (stageAtSync[i + j]) fEmpty = 0;
1670                 }
1671                 fprintf(fp, "\n");
1672                 if (fEmpty) nEmptySync++;
1673                 else nEmptySync = 0;
1674                 //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;
1675         }
1676
1677         fclose(fp);
1678         fclose(fp2);
1679         free(stageAtSync);
1680 #endif
1681
1682         cuCtxPopCurrent((CUcontext*) fCudaContext);
1683         return(0);
1684 }
1685
1686 __global__ void ClearPPHitWeights(int sliceCount)
1687 {
1688         //Clear HitWeights
1689
1690         for (int k = 0;k < sliceCount;k++)
1691         {
1692                 AliHLTTPCCATracker &tracker = ((AliHLTTPCCATracker*) gAliHLTTPCCATracker)[k];
1693                 int4* const pHitWeights = (int4*) tracker.Data().HitWeights();
1694                 const int dwCount = tracker.Data().NumberOfHitsPlusAlign();
1695                 const int stride = blockDim.x * gridDim.x;
1696                 int4 i0;
1697                 i0.x = i0.y = i0.z = i0.w = 0;
1698
1699                 for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < dwCount * sizeof(int) / sizeof(int4);i += stride)
1700                 {
1701                         pHitWeights[i] = i0;
1702                 }
1703         }
1704 }
1705
1706 int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
1707 {
1708         //Primary reconstruction function for small events (PP)
1709
1710         memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
1711
1712         if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
1713
1714         char* tmpSliceMemHost = (char*) SliceDataMemory(fHostLockedMemory, 0);
1715         char* tmpSliceMemGpu = (char*) SliceDataMemory(fGPUMemory, 0);
1716
1717         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1718         {
1719                 StandalonePerfTime(firstSlice + iSlice, 0);
1720
1721                 //Initialize GPU Slave Tracker
1722                 if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
1723                 fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(tmpSliceMemHost, RowMemory(fHostLockedMemory, firstSlice + iSlice));
1724                 fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
1725                 if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
1726                 {
1727                         HLTError("Insufficiant Slice Data Memory");
1728                         return(1);
1729                 }
1730
1731                 //Make this a GPU Tracker
1732                 fGpuTracker[iSlice].SetGPUTracker();
1733                 fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
1734
1735
1736                 fGpuTracker[iSlice].SetGPUSliceDataMemory(tmpSliceMemGpu, RowMemory(fGPUMemory, iSlice));
1737                 fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
1738
1739                 tmpSliceMemHost += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();
1740                 tmpSliceMemHost = alignPointer(tmpSliceMemHost, 64 * 1024);
1741                 tmpSliceMemGpu += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();
1742                 tmpSliceMemGpu = alignPointer(tmpSliceMemGpu, 64 * 1024);
1743
1744
1745                 //Set Pointers to GPU Memory
1746                 char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
1747
1748                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
1749                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
1750                 tmpMem = alignPointer(tmpMem, 64 * 1024);
1751
1752                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
1753                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount);
1754                 tmpMem = alignPointer(tmpMem, 64 * 1024);
1755
1756                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
1757                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
1758                 tmpMem = alignPointer(tmpMem, 64 * 1024);
1759
1760                 if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
1761                 {
1762                         HLTError("Insufficiant Track Memory");
1763                         return(1);
1764                 }
1765
1766                 if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
1767                 {
1768                         HLTError("Insufficiant Global Memory");
1769                         return(1);
1770                 }
1771
1772                 //Initialize Startup Constants
1773                 *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
1774                 *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
1775                 *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
1776                 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
1777
1778                 fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
1779
1780                 if (CUDASync("Initialization", iSlice, iSlice + firstSlice)) return(1);
1781                 StandalonePerfTime(firstSlice + iSlice, 1);
1782         }
1783
1784 #ifdef HLTCA_GPU_TEXTURE_FETCH
1785         cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
1786         size_t offset;
1787         if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
1788         {
1789                 HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
1790                 return(1);
1791         }
1792         cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
1793         if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
1794         {
1795                 HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
1796                 return(1);
1797         }
1798         cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
1799         if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
1800         {
1801                 HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
1802                 return(1);
1803         }
1804 #endif
1805
1806         //Copy Tracker Object to GPU Memory
1807         if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
1808         CudaFailedMsg(cudaMemcpyToSymbol(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice));
1809
1810         //Copy Data to GPU Global Memory
1811         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1812         {
1813                 CudaFailedMsg(cudaMemcpy(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice));
1814                 //printf("%lld %lld %d %d\n", (size_t) (char*) fGpuTracker[iSlice].Data().Memory(), (size_t) (char*) fSlaveTrackers[firstSlice + iSlice].Data().Memory(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize());
1815         }
1816         //CudaFailedMsg(cudaMemcpy(SliceDataMemory(fGPUMemory, 0), SliceDataMemory(fHostLockedMemory, 0), tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0), cudaMemcpyHostToDevice));
1817         //printf("%lld %lld %d\n", (size_t) (char*) SliceDataMemory(fGPUMemory, 0), (size_t) (char*) SliceDataMemory(fHostLockedMemory, 0), (int) (size_t) (tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0)));
1818         CudaFailedMsg(cudaMemcpy(fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyHostToDevice));
1819         CudaFailedMsg(cudaMemcpy(fGpuTracker[0].SliceDataRows(), fSlaveTrackers[firstSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) * sliceCountLocal, cudaMemcpyHostToDevice));
1820
1821         if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");
1822         AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursFinder> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_FINDER>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows());
1823         if (CUDASync("Neighbours finder", 0, firstSlice)) return 1;
1824         StandalonePerfTime(firstSlice, 2);
1825         if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");
1826         AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursCleaner> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 2);
1827         if (CUDASync("Neighbours Cleaner", 0, firstSlice)) return 1;
1828         StandalonePerfTime(firstSlice, 3);
1829         if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");
1830         AliHLTTPCCAProcessMultiA<AliHLTTPCCAStartHitsFinder> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 6);
1831         if (CUDASync("Start Hits Finder", 0, firstSlice)) return 1;
1832         StandalonePerfTime(firstSlice, 4);
1833
1834         ClearPPHitWeights <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(sliceCountLocal);
1835         if (CUDASync("Clear Hit Weights", 0, firstSlice)) return 1;
1836
1837         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1838         {
1839                 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
1840         }
1841
1842         StandalonePerfTime(firstSlice, 7);
1843
1844         if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
1845         AliHLTTPCCATrackletConstructorGPUPP<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR>>>(0, sliceCountLocal);
1846         if (CUDASync("Tracklet Constructor PP", 0, firstSlice)) return 1;
1847
1848         StandalonePerfTime(firstSlice, 8);
1849
1850         AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<selectorBlockCount, HLTCA_GPU_THREAD_COUNT_SELECTOR>>>(0, sliceCountLocal);
1851         if (CUDASync("Tracklet Selector", 0, firstSlice)) return 1;
1852         StandalonePerfTime(firstSlice, 9);
1853
1854         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice].CommonMemory(), fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyDeviceToHost));
1855
1856         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1857         {
1858                 if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");
1859
1860                 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost));
1861                 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost));
1862
1863                 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
1864                 {
1865                         HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
1866                         return(1);
1867                 }
1868                 if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
1869                 fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTracks = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracks;
1870                 fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTrackHits = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTrackHits;
1871         }
1872
1873         if (fGlobalTracking && sliceCountLocal == fgkNSlices)
1874         {
1875                 char tmpMemory[sizeof(AliHLTTPCCATracklet)
1876 #ifdef EXTERN_ROW_HITS
1877                 + HLTCA_ROW_COUNT * sizeof(int)
1878 #endif
1879                 + 16];
1880
1881                 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1882                 {
1883                         if (fSlaveTrackers[iSlice].CommonMemory()->fNTracklets)
1884                         {
1885                                 HLTError("Slave tracker tracklets found where none expected, memory not freed!\n");
1886                         }
1887                         fSlaveTrackers[iSlice].SetGPUTrackerTrackletsMemory(&tmpMemory[0], 1, fConstructorBlockCount);
1888                         fSlaveTrackers[iSlice].CommonMemory()->fNTracklets = 1;
1889                 }
1890
1891                 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1892                 {
1893                         int sliceLeft = (iSlice + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2);
1894                         int sliceRight = (iSlice + 1) % (fgkNSlices / 2);
1895                         if (iSlice >= fgkNSlices / 2)
1896                         {
1897                                 sliceLeft += fgkNSlices / 2;
1898                                 sliceRight += fgkNSlices / 2;
1899                         }
1900                         fSlaveTrackers[iSlice].PerformGlobalTracking(fSlaveTrackers[sliceLeft], fSlaveTrackers[sliceRight], HLTCA_GPU_MAX_TRACKS);
1901                 }
1902
1903                 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1904                 {
1905                         printf("Slice %d - Tracks: Local %d Global %d - Hits: Local %d Global %d\n", iSlice, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTracks, fSlaveTrackers[iSlice].CommonMemory()->fNTracks, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTrackHits, fSlaveTrackers[iSlice].CommonMemory()->fNTrackHits);
1906                 }
1907         }
1908
1909         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1910         {
1911                 fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
1912                 fSlaveTrackers[firstSlice + iSlice].WriteOutputPrepare();
1913                 fSlaveTrackers[firstSlice + iSlice].WriteOutput();
1914         }
1915
1916         StandalonePerfTime(firstSlice, 10);
1917
1918         if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
1919
1920         return(0);
1921 }
1922
1923 int AliHLTTPCCAGPUTrackerNVCC::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
1924 {
1925         //Initialize Slice Tracker Parameter for a slave tracker
1926         fSlaveTrackers[iSlice].Initialize(param);
1927         if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
1928         {
1929                 HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
1930                 return(1);
1931         }
1932         return(0);
1933 }
1934
1935 int AliHLTTPCCAGPUTrackerNVCC::ExitGPU()
1936 {
1937         //Uninitialize CUDA
1938         cuCtxPushCurrent(*((CUcontext*) fCudaContext));
1939
1940         cudaThreadSynchronize();
1941         if (fGPUMemory)
1942         {
1943                 cudaFree(fGPUMemory);
1944                 fGPUMemory = NULL;
1945         }
1946         if (fHostLockedMemory)
1947         {
1948                 for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
1949                 {
1950                         cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
1951                 }
1952                 free(fpCudaStreams);
1953                 fGpuTracker = NULL;
1954                 cudaFreeHost(fHostLockedMemory);
1955         }
1956
1957         if (CudaFailedMsg(cudaThreadExit()))
1958         {
1959                 HLTError("Could not uninitialize GPU");
1960                 return(1);
1961         }
1962
1963         if (StopHelperThreads()) return(1);
1964         pthread_mutex_destroy((pthread_mutex_t*) fHelperMemMutex);
1965         free(fHelperMemMutex);
1966
1967         for (int i = 0;i < fgkNSlices;i++) pthread_mutex_destroy(&((pthread_mutex_t*) fSliceGlobalMutexes)[i]);
1968         free(fSliceGlobalMutexes);
1969
1970         cuCtxDestroy(*((CUcontext*) fCudaContext));
1971
1972         cudaDeviceReset();
1973
1974         HLTInfo("CUDA Uninitialized");
1975         fCudaInitialized = 0;
1976         return(0);
1977 }
1978
1979 void AliHLTTPCCAGPUTrackerNVCC::ResetHelperThreads(int helpers)
1980 {
1981         HLTImportant("Error occurred, GPU tracker helper threads will be reset (Number of threads %d/%d)", fNHelperThreads, fNCPUTrackers);
1982         for (int i = 0;i < fNHelperThreads + fNCPUTrackers;i++)
1983         {
1984                 fHelperParams[i].fReset = true;
1985                 if (helpers || i >= fNHelperThreads) pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]);
1986         }
1987         HLTImportant("GPU Tracker helper threads have ben reset");
1988 }
1989
1990 int AliHLTTPCCAGPUTrackerNVCC::StopHelperThreads()
1991 {
1992         if (fNSlaveThreads)
1993         {
1994                 for (int i = 0;i < fNSlaveThreads;i++)
1995                 {
1996                         fHelperParams[i].fTerminate = true;
1997                         if (pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]))
1998                         {
1999                                 HLTError("Error unlocking mutex to terminate slave");
2000                                 return(1);
2001                         }
2002                         if (pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]))
2003                         {
2004                                 HLTError("Error locking mutex");
2005                                 return(1);
2006                         }
2007                         if (pthread_join( *((pthread_t*) fHelperParams[i].fThreadId), NULL))
2008                         {
2009                                 HLTError("Error waiting for thread to terminate");
2010                                 return(1);
2011                         }
2012                         free(fHelperParams[i].fThreadId);
2013                         for (int j = 0;j < 2;j++)
2014                         {
2015                                 if (pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]))
2016                                 {
2017                                         HLTError("Error unlocking mutex before destroying");
2018                                         return(1);
2019                                 }
2020                                 pthread_mutex_destroy(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]);
2021                         }
2022                         free(fHelperParams[i].fMutex);
2023                 }
2024                 delete[] fHelperParams;
2025         }
2026         fNSlaveThreads = 0;
2027         return(0);
2028 }
2029
2030 void AliHLTTPCCAGPUTrackerNVCC::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
2031 {
2032         //Set Output Control Pointers
2033         fOutputControl = val;
2034         for (int i = 0;i < fgkNSlices;i++)
2035         {
2036                 fSlaveTrackers[i].SetOutputControl(val);
2037         }
2038 }
2039
2040 int AliHLTTPCCAGPUTrackerNVCC::GetThread()
2041 {
2042         //Get Thread ID
2043 #ifdef R__WIN32
2044         return((int) (size_t) GetCurrentThread());
2045 #else
2046         return((int) syscall (SYS_gettid));
2047 #endif
2048 }
2049
2050 unsigned long long int* AliHLTTPCCAGPUTrackerNVCC::PerfTimer(int iSlice, unsigned int i)
2051 {
2052         //Returns pointer to PerfTimer i of slice iSlice
2053         return(fSlaveTrackers ? fSlaveTrackers[iSlice].PerfTimer(i) : NULL);
2054 }
2055
2056 const AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerNVCC::OutputControl() const
2057 {
2058         //Return Pointer to Output Control Structure
2059         return fOutputControl;
2060 }
2061
2062 int AliHLTTPCCAGPUTrackerNVCC::GetSliceCount() const
2063 {
2064         //Return max slice count processable
2065         return(fSliceCount);
2066 }
2067
2068 char* AliHLTTPCCAGPUTrackerNVCC::MergerBaseMemory()
2069 {
2070         return(alignPointer((char*) fGPUMergerHostMemory, 1024 * 1024));
2071 }
2072
2073 int AliHLTTPCCAGPUTrackerNVCC::RefitMergedTracks(AliHLTTPCGMMerger* Merger)
2074 {
2075 #ifndef HLTCA_GPU_MERGER
2076         HLTError("HLTCA_GPU_MERGER compile flag not set");
2077         return(1);
2078 #else
2079         if (!fCudaInitialized)
2080         {
2081                 HLTError("GPU Merger not initialized");
2082                 return(1);
2083         }
2084         unsigned long long int a, b, c, d, e;
2085         AliHLTTPCCATracker::StandaloneQueryFreq(&e);
2086
2087         char* gpumem = (char*) fGPUMergerMemory;
2088         float *X, *Y, *Z, *Angle;
2089         unsigned int *RowType;
2090         AliHLTTPCGMMergedTrack* tracks;
2091         float* field;
2092         AliHLTTPCCAParam* param;
2093
2094         gpumem = alignPointer(gpumem, 1024 * 1024);
2095
2096         AssignMemory(X, gpumem, Merger->NClusters());
2097         AssignMemory(Y, gpumem, Merger->NClusters());
2098         AssignMemory(Z, gpumem, Merger->NClusters());
2099         AssignMemory(Angle, gpumem, Merger->NClusters());
2100         AssignMemory(RowType, gpumem, Merger->NClusters());
2101         AssignMemory(tracks, gpumem, Merger->NOutputTracks());
2102         AssignMemory(field, gpumem, 6);
2103         AssignMemory(param, gpumem, 1);
2104
2105
2106         if ((size_t) (gpumem - (char*) fGPUMergerMemory) > (size_t) fGPUMergerMaxMemory)
2107         {
2108                 HLTError("Insufficiant GPU Merger Memory");
2109         }
2110
2111         cuCtxPushCurrent(*((CUcontext*) fCudaContext));
2112
2113         if (fDebugLevel >= 2) HLTInfo("Running GPU Merger (%d/%d)", Merger->NOutputTrackClusters(), Merger->NClusters());
2114         AliHLTTPCCATracker::StandaloneQueryTime(&a);
2115         CudaFailedMsg(cudaMemcpy(X, Merger->ClusterX(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
2116         CudaFailedMsg(cudaMemcpy(Y, Merger->ClusterY(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
2117         CudaFailedMsg(cudaMemcpy(Z, Merger->ClusterZ(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
2118         CudaFailedMsg(cudaMemcpy(Angle, Merger->ClusterAngle(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
2119         CudaFailedMsg(cudaMemcpy(RowType, Merger->ClusterRowType(), Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyHostToDevice));
2120         CudaFailedMsg(cudaMemcpy(tracks, Merger->OutputTracks(), Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyHostToDevice));
2121         CudaFailedMsg(cudaMemcpy(field, Merger->PolinomialFieldBz(), 6 * sizeof(float), cudaMemcpyHostToDevice));
2122         CudaFailedMsg(cudaMemcpy(param, fSlaveTrackers[0].pParam(), sizeof(AliHLTTPCCAParam), cudaMemcpyHostToDevice));
2123         AliHLTTPCCATracker::StandaloneQueryTime(&b);
2124         RefitTracks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(tracks, Merger->NOutputTracks(), field, X, Y, Z, RowType, Angle, param);
2125         CudaFailedMsg(cudaThreadSynchronize());
2126         AliHLTTPCCATracker::StandaloneQueryTime(&c);
2127         CudaFailedMsg(cudaMemcpy(Merger->ClusterX(), X, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
2128         CudaFailedMsg(cudaMemcpy(Merger->ClusterY(), Y, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
2129         CudaFailedMsg(cudaMemcpy(Merger->ClusterZ(), Z, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
2130         CudaFailedMsg(cudaMemcpy(Merger->ClusterAngle(), Angle, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
2131         CudaFailedMsg(cudaMemcpy(Merger->ClusterRowType(), RowType, Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyDeviceToHost));
2132         CudaFailedMsg(cudaMemcpy((void*) Merger->OutputTracks(), tracks, Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyDeviceToHost));
2133         CudaFailedMsg(cudaThreadSynchronize());
2134         AliHLTTPCCATracker::StandaloneQueryTime(&d);
2135         if (fDebugLevel >= 2) HLTInfo("GPU Merger Finished");
2136
2137         if (fDebugLevel > 0)
2138         {
2139                 int copysize = 4 * Merger->NOutputTrackClusters() * sizeof(float) + Merger->NOutputTrackClusters() * sizeof(unsigned int) + Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack) + 6 * sizeof(float) + sizeof(AliHLTTPCCAParam);
2140                 double speed = (double) copysize * (double) e / (double) (b - a) / 1e9;
2141                 printf("GPU Fit:\tCopy To:\t%lld us (%lf GB/s)\n", (b - a) * 1000000 / e, speed);
2142                 printf("\t\tFit:\t%lld us\n", (c - b) * 1000000 / e);
2143                 speed = (double) copysize * (double) e / (double) (d - c) / 1e9;
2144                 printf("\t\tCopy From:\t%lld us (%lf GB/s)\n", (d - c) * 1000000 / e, speed);
2145         }
2146
2147         cuCtxPopCurrent((CUcontext*) fCudaContext);
2148         return(0);
2149 #endif
2150 }
2151
2152 int AliHLTTPCCAGPUTrackerNVCC::IsInitialized()
2153 {
2154         return(fCudaInitialized);
2155 }
2156
2157 AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()
2158 {
2159         return new AliHLTTPCCAGPUTrackerNVCC;
2160 }
2161
2162 void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr)
2163 {
2164         delete ptr;
2165 }
2166