1 // **************************************************************************
2 // This file is property of and copyright by the ALICE HLT Project *
3 // ALICE Experiment at CERN, All rights reserved. *
5 // Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
6 // Ivan Kisel <kisel@kip.uni-heidelberg.de> *
7 // David Rohr <drohr@kip.uni-heidelberg.de> *
8 // for The ALICE HLT Project. *
10 // Permission to use, copy, modify and distribute this software and its *
11 // documentation strictly for non-commercial purposes is hereby granted *
12 // without fee, provided that the above copyright notice appears in all *
13 // copies and that both the copyright notice and this permission notice *
14 // appear in the supporting documentation. The authors make no claims *
15 // about the suitability of this software for any purpose. It is *
16 // provided "as is" without express or implied warranty. *
18 //***************************************************************************
20 #define HLTCA_GPU_DEFAULT_MAX_SLICE_COUNT 36
22 #include "AliHLTTPCCAGPUTrackerNVCC.h"
26 #include <sm_11_atomic_functions.h>
27 #include <sm_12_atomic_functions.h>
32 #include <sys/syscall.h>
33 #include <semaphore.h>
36 #include "AliHLTTPCCADef.h"
37 #include "AliHLTTPCCAGPUConfig.h"
39 #if defined(HLTCA_STANDALONE) & !defined(_WIN32)
46 //Disable assertions since they produce errors in GPU Code
52 __constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];
53 #ifdef HLTCA_GPU_TEXTURE_FETCH
54 texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;
55 texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;
56 texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
59 //Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code
60 #include "AliHLTTPCCATrackParam.cxx"
61 #include "AliHLTTPCCATrack.cxx"
63 #include "AliHLTTPCCAHitArea.cxx"
64 #include "AliHLTTPCCAGrid.cxx"
65 #include "AliHLTTPCCARow.cxx"
66 #include "AliHLTTPCCAParam.cxx"
67 #include "AliHLTTPCCATracker.cxx"
69 #include "AliHLTTPCCAProcess.h"
71 #include "AliHLTTPCCATrackletSelector.cxx"
72 #include "AliHLTTPCCANeighboursFinder.cxx"
73 #include "AliHLTTPCCANeighboursCleaner.cxx"
74 #include "AliHLTTPCCAStartHitsFinder.cxx"
75 #include "AliHLTTPCCAStartHitsSorter.cxx"
76 #include "AliHLTTPCCATrackletConstructor.cxx"
78 #ifdef HLTCA_GPU_MERGER
79 #include "AliHLTTPCGMMerger.h"
80 #include "AliHLTTPCGMTrackParam.cxx"
83 #include "MemoryAssignmentHelpers.h"
85 #ifndef HLTCA_STANDALONE
86 #include "AliHLTDefinitions.h"
87 #include "AliHLTSystem.h"
91 //#define RANDOM_ERROR || rand() % 500 == 1
93 ClassImp( AliHLTTPCCAGPUTrackerNVCC )
// Run the cross-sector ("global") tracking step for one slice.
// Spin-waits until the slice itself and both geometric neighbours have produced
// their slice output, then extends tracks into the neighbouring sectors while
// holding the per-slice mutexes of both neighbours, and finally publishes the
// left/right "global ready" flags consumed by the output-writing threads.
// Returns 1 when aborted by a helper-thread reset request (hParam->fReset).
// NOTE(review): excerpt is incomplete — the success-path return statement is
// not visible here; presumably returns 0.
95 int AliHLTTPCCAGPUTrackerNVCC::GlobalTracking(int iSlice, int threadId, AliHLTTPCCAGPUTrackerNVCC::helperParam* hParam)
97 	if (fDebugLevel >= 3) printf("GPU Tracker running Global Tracking for slice %d on thread %d\n", iSlice, threadId);
// Neighbour indices wrap modulo half the sector count: the TPC sectors are
// split into two halves, and the adjustment below keeps both neighbours in
// the same half as iSlice.
99 	int sliceLeft = (iSlice + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2);
100 	int sliceRight = (iSlice + 1) % (fgkNSlices / 2);
101 	if (iSlice >= fgkNSlices / 2)
103 		sliceLeft += fgkNSlices / 2;
104 		sliceRight += fgkNSlices / 2;
// Busy-wait for the three required slice outputs; bail out on reset request.
106 	while (fSliceOutputReady < iSlice || fSliceOutputReady < sliceLeft || fSliceOutputReady < sliceRight)
108 		if (hParam != NULL && hParam->fReset) return(1);
// Lock both neighbours so no other thread mutates their trackers while we
// extend tracks across the sector boundaries.
111 	pthread_mutex_lock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceLeft]);
112 	pthread_mutex_lock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceRight]);
113 	fSlaveTrackers[iSlice].PerformGlobalTracking(fSlaveTrackers[sliceLeft], fSlaveTrackers[sliceRight], HLTCA_GPU_MAX_TRACKS);
114 	pthread_mutex_unlock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceLeft]);
115 	pthread_mutex_unlock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceRight]);
// Signal that this slice has contributed its global-tracking extension to
// the left/right neighbours (consumed before WriteOutput of those slices).
117 	fSliceLeftGlobalReady[sliceLeft] = 1;
118 	fSliceRightGlobalReady[sliceRight] = 1;
119 	if (fDebugLevel >= 3) printf("GPU Tracker finished Global Tracking for slice %d on thread %d\n", iSlice, threadId);
// pthread entry point for GPU helper threads and pure-CPU tracker threads.
// 'arg' is a helperParam describing the thread's role; the thread blocks on
// its mutex[0] until woken by the main Reconstruct loop, performs one work
// phase, then signals completion by unlocking mutex[1].
123 void* AliHLTTPCCAGPUTrackerNVCC::helperWrapper(void* arg)
125 	AliHLTTPCCAGPUTrackerNVCC::helperParam* par = (AliHLTTPCCAGPUTrackerNVCC::helperParam*) arg;
126 	AliHLTTPCCAGPUTrackerNVCC* cls = par->fCls;
// Scratch tracker used only by the CPU-tracking branch below.
128 	AliHLTTPCCATracker* tmpTracker = new AliHLTTPCCATracker;
130 #ifdef HLTCA_STANDALONE
131 	if (cls->fDebugLevel >= 2) HLTInfo("\tHelper thread %d starting", par->fNum);
134 #if defined(HLTCA_STANDALONE) & !defined(_WIN32)
// CPU-affinity pinning of helper threads (currently disabled).
137 	CPU_SET(par->fNum * 2 + 2, &mask);
138 	//sched_setaffinity(0, sizeof(mask), &mask);
// Main loop: each successful lock of mutex[0] is one wake-up / work request.
141 	while(pthread_mutex_lock(&((pthread_mutex_t*) par->fMutex)[0]) == 0 && par->fTerminate == false)
// --- Branch 1: this thread acts as a full CPU tracker for the slices that
// were assigned to CPU tracking rather than the GPU.
145 		for (int i = 0;i < cls->fNSlicesPerCPUTracker;i++)
// Map (thread number, i) to the global slice index: CPU-tracked slices are
// taken from the end of the slice range.
147 			int myISlice = cls->fSliceCount - cls->fNCPUTrackers * cls->fNSlicesPerCPUTracker + (par->fNum - cls->fNHelperThreads) * cls->fNSlicesPerCPUTracker + i;
148 #ifdef HLTCA_STANDALONE
149 			if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Doing full CPU tracking, Slice %d", par->fNum, myISlice);
// Full CPU tracking chain on the scratch tracker; WriteOutputPrepare is
// serialized via fHelperMemMutex because it touches shared output memory.
153 				tmpTracker->Initialize(cls->fSlaveTrackers[par->fFirstSlice + myISlice].Param());
154 				tmpTracker->ReadEvent(&par->pClusterData[myISlice]);
155 				tmpTracker->DoTracking();
156 				tmpTracker->SetOutput(&par->pOutput[myISlice]);
157 				pthread_mutex_lock((pthread_mutex_t*) cls->fHelperMemMutex);
158 				tmpTracker->WriteOutputPrepare();
159 				pthread_mutex_unlock((pthread_mutex_t*) cls->fHelperMemMutex);
160 				tmpTracker->WriteOutput();
// Dead alternative implementation kept for reference: run the CPU tracking
// directly on the slave tracker with manually allocated GPU-layout buffers.
162 			/*cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUSliceDataMemory((char*) new uint4[HLTCA_GPU_SLICE_DATA_MEMORY/sizeof(uint4)], (char*) new uint4[HLTCA_GPU_ROWS_MEMORY/sizeof(uint4)]);
163 			cls->fSlaveTrackers[par->fFirstSlice + myISlice].ReadEvent(&par->pClusterData[myISlice]);
164 			cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersTracklets(HLTCA_GPU_MAX_TRACKLETS);
165 			cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersHits(par->pClusterData[myISlice].NumberOfClusters());
166 			cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersTracks(HLTCA_GPU_MAX_TRACKS, par->pClusterData[myISlice].NumberOfClusters());
167 			cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS, cls->fConstructorBlockCount);
168 			cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].HitMemorySize()/sizeof( uint4 ) + 100]), par->pClusterData[myISlice].NumberOfClusters());
169 			cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerTracksMemory(reinterpret_cast<char*> ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackMemorySize()/sizeof( uint4 ) + 100]), HLTCA_GPU_MAX_TRACKS, par->pClusterData[myISlice].NumberOfClusters());
170 			cls->fSlaveTrackers[par->fFirstSlice + myISlice].DoTracking();
171 			cls->WriteOutput(par->pOutput, par->fFirstSlice, myISlice, par->fNum + 1);
172 			delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].HitMemory();
173 			delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackletMemory();
174 			delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackMemory();*/
176 #ifdef HLTCA_STANDALONE
177 			if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Finished, Slice %d", par->fNum, myISlice);
// --- Branch 2: GPU helper thread. Slices are distributed round-robin:
// this thread handles i = fNum+1, fNum+1+(nHelpers+1), ...
// NOTE(review): the phase selection (ReadEvent vs WriteOutput phase) is
// controlled by code not visible in this excerpt (par->fPhase) — confirm
// against the full source.
183 			int mustRunSlice19 = 0;
184 			for (int i = par->fNum + 1;i < par->fSliceCount;i += cls->fNHelperThreads + 1)
186 				//if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Running, Slice %d+%d, Phase %d", par->fNum, par->fFirstSlice, i, par->fPhase);
189 				if (cls->fUseGlobalTracking)
// Global tracking processes slice i+1 (wrapped within the detector half);
// slices whose neighbours are not yet available are deferred.
191 					int realSlice = i + 1;
192 					if (realSlice % (fgkNSlices / 2) < 1) realSlice -= fgkNSlices / 2;
194 					if (realSlice % (fgkNSlices / 2) != 1)
196 						cls->GlobalTracking(realSlice, par->fNum + 1, par);
// Wait for both neighbour contributions before writing this slice's output.
205 					while (cls->fSliceLeftGlobalReady[realSlice] == 0 || cls->fSliceRightGlobalReady[realSlice] == 0)
207 						if (par->fReset) goto ResetHelperThread;
209 					cls->WriteOutput(par->pOutput, par->fFirstSlice, realSlice, par->fNum + 1);
// Non-global-tracking path: wait for plain slice output, then write it.
214 					while (cls->fSliceOutputReady < i)
216 						if (par->fReset) goto ResetHelperThread;
218 					cls->WriteOutput(par->pOutput, par->fFirstSlice, i, par->fNum + 1);
// Read-event phase: prepare cluster data for slice i.
223 					cls->ReadEvent(par->pClusterData, par->fFirstSlice, i, par->fNum + 1);
226 				//if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Finished, Slice %d+%d, Phase %d", par->fNum, par->fFirstSlice, i, par->fPhase);
// Hard-coded slice index 19: presumably the one slice whose global-tracking
// dependencies are satisfied last and must be flushed here — TODO confirm
// against the full source (likely related to mustRunSlice19 above).
230 				while (cls->fSliceLeftGlobalReady[19] == 0 || cls->fSliceRightGlobalReady[19] == 0)
232 					if (par->fReset) goto ResetHelperThread;
234 				cls->WriteOutput(par->pOutput, par->fFirstSlice, 19, par->fNum + 1);
// Reset handling target for the 'goto's above.
238 		cls->ResetThisHelperThread(par);
240 #ifdef HLTCA_STANDALONE
241 	if (cls->fDebugLevel >= 2) HLTInfo("\tHelper thread %d terminating", par->fNum);
// Final completion signal so the joining thread is not left blocked.
244 	pthread_mutex_unlock(&((pthread_mutex_t*) par->fMutex)[1]);
// Acknowledge a reset request on a helper thread: log it and unlock the
// "work done" mutex (index 1) so the controlling thread can proceed.
249 void AliHLTTPCCAGPUTrackerNVCC::ResetThisHelperThread(AliHLTTPCCAGPUTrackerNVCC::helperParam* par)
251 	if (par->fReset) HLTImportant("GPU Helper Thread %d reseting", par->fNum);
253 	pthread_mutex_unlock(&((pthread_mutex_t*) par->fMutex)[1]);
256 #define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem"
// Constructor: zero/default-initialize all tracker state (only part of the
// initializer list is visible in this excerpt) and allocate the CUDA driver
// context object. Real device setup happens later in InitGPU().
258 AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() :
261 	fHostLockedMemory(NULL),
262 	fGPUMergerMemory(NULL),
263 	fGPUMergerHostMemory(NULL),
264 	fGPUMergerMaxMemory(0),
// All debug sub-channels enabled by default.
266 	fDebugMask(0xFFFFFFFF),
270 	fSliceCount(HLTCA_GPU_DEFAULT_MAX_SLICE_COUNT),
272 	fOutputControl(NULL),
// Block counts are placeholders; overwritten from device properties in InitGPU.
277 	fConstructorBlockCount(30),
278 	selectorBlockCount(30),
280 	fNHelperThreads(HLTCA_GPU_DEFAULT_HELPER_THREADS),
282 	fHelperMemMutex(NULL),
283 	fSliceOutputReady(0),
284 	fSliceGlobalMutexes(NULL),
286 	fNSlicesPerCPUTracker(0),
288 	fUseGlobalTracking(0),
// Owned CUcontext; stored as void* to keep CUDA types out of the header.
291 	fCudaContext = (void*) new CUcontext;
// Destructor: release the CUcontext object allocated in the constructor.
// (Further teardown, if any, is not visible in this excerpt.)
294 AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC()
296 	delete (CUcontext*) fCudaContext;
// Release the global named semaphore that serializes GPU initialization
// across processes. Windows path uses a HANDLE + ReleaseSemaphore; the POSIX
// path unlinks the named semaphore (the sem_post, if any, is not visible in
// this excerpt).
299 void AliHLTTPCCAGPUTrackerNVCC::ReleaseGlobalLock(void* sem)
301 	//Release the global named semaphore that locks GPU Initialization
303 	HANDLE* h = (HANDLE*) sem;
304 	ReleaseSemaphore(*h, 1, NULL);
308 	sem_t* pSem = (sem_t*) sem;
310 	sem_unlink(SemLockName);
// Validate that the compile-time GPU memory budgets are large enough for the
// requested slice count: tracker objects, common memory, and row memory.
// Logs an error per violated constraint; presumably returns nonzero on
// failure and 0 on success (return statements not visible in this excerpt —
// the caller treats nonzero as failure, see InitGPU).
314 int AliHLTTPCCAGPUTrackerNVCC::CheckMemorySizes(int sliceCount)
316 	//Check constants for correct memory sizes
317 	if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
319 		HLTError("Insufficiant Tracker Object Memory for %d slices", sliceCount);
323 	if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
325 		HLTError("Insufficiant Common Memory");
329 	if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
331 		HLTError("Insufficiant Row Memory");
335 	if (fDebugLevel >= 3)
337 		HLTInfo("Memory usage: Tracker Object %d / %d, Common Memory %d / %d, Row Memory %d / %d", (int) sizeof(AliHLTTPCCATracker) * sliceCount, HLTCA_GPU_TRACKER_OBJECT_MEMORY, (int) (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize()), HLTCA_GPU_COMMON_MEMORY, (int) (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow)), HLTCA_GPU_ROWS_MEMORY);
// One-time GPU initialization: select the best eligible CUDA device, create a
// driver context, allocate device and page-locked host memory, set up slave
// trackers, CUDA streams, helper threads and mutexes.
// Returns 1 on any failure, 0 on success (several early-return lines are not
// visible in this excerpt). Guarded by a system-wide named semaphore so that
// concurrent processes do not probe/initialize devices simultaneously.
342 int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)
344 	//Find best CUDA device, initialize and allocate memory
346 #if defined(HLTCA_STANDALONE) & !defined(_WIN32)
350 	//sched_setaffinity(0, sizeof(mask), &mask);
// Default to the configured slice count; verify compile-time memory budgets.
353 	if (sliceCount == -1) sliceCount = fSliceCount;
355 	if (CheckMemorySizes(sliceCount)) return(1);
// --- Acquire the global init lock (Windows named semaphore / POSIX named
// semaphore with a 10 s timeout and forced reset on suspected dead holder).
358 	HANDLE* semLock = new HANDLE;
359 	*semLock = CreateSemaphore(NULL, 1, 1, SemLockName);
360 	if (*semLock == NULL)
362 		HLTError("Error creating GPUInit Semaphore");
365 	WaitForSingleObject(*semLock, INFINITE);
367 	sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1);
368 	if (semLock == SEM_FAILED)
370 		HLTError("Error creating GPUInit Semaphore");
374 	clock_gettime(CLOCK_REALTIME, &semtime);
375 	semtime.tv_sec += 10;
376 	while (sem_timedwait(semLock, &semtime) != 0)
378 		HLTError("Global Lock for GPU initialisation was not released for 10 seconds, assuming another thread died");
379 		HLTWarning("Resetting the global lock");
// Remember which host thread owns the CUDA context.
384 	fThreadId = GetThread();
386 	cudaDeviceProp fCudaDeviceProp;
// --- Compute total device memory requirement (per-slice buffers + merger).
388 	fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
390 #ifdef HLTCA_GPU_MERGER
391 	fGPUMergerMaxMemory = 2000000 * 5 * sizeof(float);
392 	fGPUMemSize += fGPUMergerMaxMemory;
// --- Enumerate CUDA devices and pick the "fastest" eligible one by a speed
// heuristic (SMs * clock * warp size * free memory * major^2).
395 #ifndef CUDA_DEVICE_EMULATION
396 	int count, bestDevice = -1;
397 	long long int bestDeviceSpeed = 0, deviceSpeed;
398 	if (CudaFailedMsg(cudaGetDeviceCount(&count)))
400 		HLTError("Error getting CUDA Device Count");
401 		ReleaseGlobalLock(semLock);
404 	if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");
// Minimum required compute capability (2.0 or 1.2 depending on build config).
406 	const int reqVerMaj = 2;
407 	const int reqVerMin = 0;
409 	const int reqVerMaj = 1;
410 	const int reqVerMin = 2;
412 	for (int i = 0;i < count;i++)
414 		if (fDebugLevel >= 4) printf("Examining device %d\n", i);
415 #if CUDA_VERSION > 3010
418 		unsigned int free, total;
// Query free memory via a temporary driver context on each candidate device.
422 		cuDeviceGet(&tmpDevice, i);
423 		CUcontext tmpContext;
424 		cuCtxCreate(&tmpContext, 0, tmpDevice);
425 		if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";
426 		cuCtxDestroy(tmpContext);
427 		if (fDebugLevel >= 4) printf("Obtained current memory usage for device %d\n", i);
428 		if (CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i))) continue;
429 		if (fDebugLevel >= 4) printf("Obtained device properties for device %d\n", i);
// Eligibility: real device (major < 9 excludes emulation), meets the minimum
// compute capability, and has enough free memory (with headroom).
430 		int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < reqVerMaj || (fCudaDeviceProp.major == reqVerMaj && fCudaDeviceProp.minor < reqVerMin)) && free >= fGPUMemSize + 100 * 1024 + 1024;
431 #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
432 		//if (sliceCount > fCudaDeviceProp.multiProcessorCount * HLTCA_GPU_BLOCK_COUNT_CONSTRUCTOR_MULTIPLIER) deviceOK = 0;
435 		if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %lld / %lld)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, (long long int) free, (long long int) fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");
436 		deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free * (long long int) fCudaDeviceProp.major * (long long int) fCudaDeviceProp.major;
437 		if (deviceOK && deviceSpeed > bestDeviceSpeed)
440 			bestDeviceSpeed = deviceSpeed;
443 	if (bestDevice == -1)
445 		HLTWarning("No %sCUDA Device available, aborting CUDA Initialisation", count ? "appropriate " : "");
446 		HLTInfo("Requiring Revision %d.%d, Mem: %lld, Multiprocessors: %d", reqVerMaj, reqVerMin, fGPUMemSize + 100 * 1024 * 1024, sliceCount);
447 		ReleaseGlobalLock(semLock);
// Honour an explicit device override, otherwise use the auto-selected one.
451 	if (forceDeviceID == -1)
452 		fCudaDevice = bestDevice;
454 		fCudaDevice = forceDeviceID;
459 	cudaGetDeviceProperties(&fCudaDeviceProp ,fCudaDevice );
// --- Dump the chosen device's properties at debug level >= 1.
461 	if (fDebugLevel >= 1)
463 		HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);
464 		HLTInfo("totalGlobalMem = %lld", (unsigned long long int) fCudaDeviceProp.totalGlobalMem);
465 		HLTInfo("sharedMemPerBlock = %lld", (unsigned long long int) fCudaDeviceProp.sharedMemPerBlock);
466 		HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);
467 		HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);
468 		HLTInfo("memPitch = %lld", (unsigned long long int) fCudaDeviceProp.memPitch);
469 		HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);
470 		HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);
471 		HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);
472 		HLTInfo("totalConstMem = %lld", (unsigned long long int) fCudaDeviceProp.totalConstMem);
473 		HLTInfo("major = %d", fCudaDeviceProp.major);
474 		HLTInfo("minor = %d", fCudaDeviceProp.minor);
475 		HLTInfo("clockRate = %d", fCudaDeviceProp.clockRate);
476 		HLTInfo("memoryClockRate = %d", fCudaDeviceProp.memoryClockRate);
477 		HLTInfo("multiProcessorCount = %d", fCudaDeviceProp.multiProcessorCount);
478 		HLTInfo("textureAlignment = %lld", (unsigned long long int) fCudaDeviceProp.textureAlignment);
// Derive kernel grid sizes from the SM count of the selected device.
480 	fConstructorBlockCount = fCudaDeviceProp.multiProcessorCount * HLTCA_GPU_BLOCK_COUNT_CONSTRUCTOR_MULTIPLIER;
481 	selectorBlockCount = fCudaDeviceProp.multiProcessorCount * HLTCA_GPU_BLOCK_COUNT_SELECTOR_MULTIPLIER;
// Hard floor: compute capability below 1.2 is unsupported.
483 	if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
485 		HLTError( "Unsupported CUDA Device" );
486 		ReleaseGlobalLock(semLock);
// --- Create the driver context bound to this host thread.
490 	if (cuCtxCreate((CUcontext*) fCudaContext, CU_CTX_SCHED_AUTO, fCudaDevice) != CUDA_SUCCESS)
492 		HLTError("Could not set CUDA Device!");
493 		ReleaseGlobalLock(semLock);
// --- Allocate the single large device memory block; merger memory is carved
// off its tail.
497 	if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
499 		HLTError("CUDA Memory Allocation Error");
501 		ReleaseGlobalLock(semLock);
504 	fGPUMergerMemory = ((char*) fGPUMemory) + fGPUMemSize - fGPUMergerMaxMemory;
// Device probing/allocation finished: let other processes initialize.
505 	ReleaseGlobalLock(semLock);
506 	if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
// --- Allocate page-locked (pinned) host memory for async transfers.
507 	int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
508 #ifdef HLTCA_GPU_MERGER
509 	hostMemSize += fGPUMergerMaxMemory;
511 	if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
513 		cudaFree(fGPUMemory);
515 		HLTError("Error allocating Page Locked Host Memory");
518 	if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);
519 	fGPUMergerHostMemory = ((char*) fHostLockedMemory) + hostMemSize - fGPUMergerMaxMemory;
// In debug mode, poison device memory with a known byte pattern (143).
521 	if (fDebugLevel >= 1)
523 		CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
526 	fSliceCount = sliceCount;
527 	//Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
528 	fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);
// --- Point every slave tracker at its region of the pinned host memory.
530 	for (int i = 0;i < fgkNSlices;i++)
532 		fSlaveTrackers[i].SetGPUTracker();
533 		fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
534 		fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
// --- One CUDA stream per slice (minimum 3) for copy/compute overlap.
537 	fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
538 	cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
539 	for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
541 		if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
543 			cudaFree(fGPUMemory);
544 			cudaFreeHost(fHostLockedMemory);
546 			HLTError("Error creating CUDA Stream");
// --- Spawn helper/CPU-tracker threads and the mutexes they synchronize on.
551 	if (StartHelperThreads()) return(1);
553 	fHelperMemMutex = malloc(sizeof(pthread_mutex_t));
554 	if (fHelperMemMutex == NULL)
556 		HLTError("Memory allocation error");
557 		cudaFree(fGPUMemory);
558 		cudaFreeHost(fHostLockedMemory);
563 	if (pthread_mutex_init((pthread_mutex_t*) fHelperMemMutex, NULL))
565 		HLTError("Error creating pthread mutex");
566 		cudaFree(fGPUMemory);
567 		cudaFreeHost(fHostLockedMemory);
// Per-slice mutexes used by GlobalTracking to lock neighbour slices.
572 	fSliceGlobalMutexes = malloc(sizeof(pthread_mutex_t) * fgkNSlices);
573 	if (fSliceGlobalMutexes == NULL)
575 		HLTError("Memory allocation error");
576 		cudaFree(fGPUMemory);
577 		cudaFreeHost(fHostLockedMemory);
581 	for (int i = 0;i < fgkNSlices;i++)
583 		if (pthread_mutex_init(&((pthread_mutex_t*) fSliceGlobalMutexes)[i], NULL))
585 			HLTError("Error creating pthread mutex");
586 			cudaFree(fGPUMemory);
587 			cudaFreeHost(fHostLockedMemory);
// Detach the context from this thread; Reconstruct() pushes it on demand.
593 	cuCtxPopCurrent((CUcontext*) fCudaContext);
594 	fCudaInitialized = 1;
595 	HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %d, Max slices: %d)", fCudaDevice, fCudaDeviceProp.name, fThreadId, fSliceCount);
// --- Optional warm-up reconstruction for benchmarking (currently disabled
// by the '&& 0' in the condition).
597 #if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)
598 	if (fDebugLevel < 2 && 0)
600 		//Do one initial run for Benchmark reasons
601 		const int useDebugLevel = fDebugLevel;
603 		AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];
607 		AliHLTTPCCAParam tmpParam;
608 		AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;
610 		fin.open("events/settings.dump");
613 		for (int i = 0;i < sliceCount;i++)
615 			fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);
616 			tmpParam.ReadSettings(fin);
617 			InitializeSliceParam(i, tmpParam);
621 		fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);
622 		for (int i = 0;i < sliceCount;i++)
624 			tmpCluster[i].StartReading(i, 0);
625 			tmpCluster[i].ReadEvent(fin);
629 		AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];
630 		memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));
632 		Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);
633 		for (int i = 0;i < sliceCount;i++)
637 			fSlaveTrackers[i].SetOutputControl(NULL);
641 		fDebugLevel = useDebugLevel;
// Create the helper-thread pool: fNHelperThreads GPU helpers plus
// fNCPUTrackers CPU-tracker threads. Each thread gets a helperParam with two
// mutexes: [0] is the "work request" mutex (created locked so helperWrapper
// blocks until woken), [1] is the "work done" mutex. Returns 1 on failure;
// the success-path return is not visible in this excerpt.
648 int AliHLTTPCCAGPUTrackerNVCC::StartHelperThreads()
650 	int nThreads = fNHelperThreads + fNCPUTrackers;
653 		fHelperParams = new helperParam[nThreads];
654 		if (fHelperParams == NULL)
656 			HLTError("Memory allocation error");
657 			cudaFree(fGPUMemory);
658 			cudaFreeHost(fHostLockedMemory);
662 		for (int i = 0;i < nThreads;i++)
664 			fHelperParams[i].fCls = this;
665 			fHelperParams[i].fTerminate = false;
666 			fHelperParams[i].fReset = false;
667 			fHelperParams[i].fNum = i;
668 			fHelperParams[i].fMutex = malloc(2 * sizeof(pthread_mutex_t));
669 			if (fHelperParams[i].fMutex == NULL)
671 				HLTError("Memory allocation error");
672 				cudaFree(fGPUMemory);
673 				cudaFreeHost(fHostLockedMemory);
// Initialize both mutexes and immediately lock them: mutex[0] blocks the
// helper until a work request, mutex[1] marks "work in progress".
677 			for (int j = 0;j < 2;j++)
679 				if (pthread_mutex_init(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j], NULL))
681 					HLTError("Error creating pthread mutex");
682 					cudaFree(fGPUMemory);
683 					cudaFreeHost(fHostLockedMemory);
688 				pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]);
690 			fHelperParams[i].fThreadId = (void*) malloc(sizeof(pthread_t));
692 			if (pthread_create((pthread_t*) fHelperParams[i].fThreadId, NULL, helperWrapper, &fHelperParams[i]))
694 				HLTError("Error starting slave thread");
695 				cudaFree(fGPUMemory);
696 				cudaFreeHost(fHostLockedMemory);
701 	fNSlaveThreads = nThreads;
// Round a pointer up to the next 'alignment' boundary.
// NOTE(review): the excerpt omits lines — presumably the increment is guarded
// by an (adr % alignment) check and the function returns (T*) adr; confirm
// against the full source.
705 template <class T> inline T* AliHLTTPCCAGPUTrackerNVCC::alignPointer(T* ptr, int alignment)
707 	//Macro to align Pointers.
708 	//Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker
709 	//(As long as every single data structure is <= 1 MB)
711 	size_t adr = (size_t) ptr;
714 		adr += alignment - (adr % alignment);
// Check a CUDA runtime return code; on failure log the error string with the
// call site (file:line, supplied by the CudaFailedMsg macro wrapper).
// Returns false on cudaSuccess; presumably returns true on error (the final
// return is not visible in this excerpt).
719 bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsgA(cudaError_t error, const char* file, int line)
721 	//Check for CUDA Error and in the case of an error display the corresponding error string
722 	if (error == cudaSuccess) return(false);
723 	HLTWarning("CUDA Error: %d / %s (%s:%d)", error, cudaGetErrorString(error), file, line);
// Debug-only kernel synchronization: checks the sticky launch error via
// cudaGetLastError, then blocks on cudaThreadSynchronize to surface
// asynchronous execution errors. Skipped entirely (returns 0) at debug
// level 0 to avoid killing copy/compute overlap in production.
// 'state' names the kernel/stage for the error message.
727 int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state, int sliceLocal, int slice)
729 	//Wait for CUDA-Kernel to finish and check for CUDA errors afterwards
731 	if (fDebugLevel == 0) return(0);
733 	cuErr = cudaGetLastError();
734 	if (cuErr != cudaSuccess)
736 		HLTError("Cuda Error %s while running kernel (%s) (Slice %d; %d/%d)", cudaGetErrorString(cuErr), state, sliceLocal, slice, fgkNSlices);
739 	if (CudaFailedMsg(cudaThreadSynchronize()))
741 		HLTError("CUDA Error while synchronizing (%s) (Slice %d; %d/%d)", state, sliceLocal, slice, fgkNSlices);
744 	if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
// Set the verbosity level and, optionally, redirect debug output to a new
// stream (kept unchanged when NewOutFile is NULL).
748 void AliHLTTPCCAGPUTrackerNVCC::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
750 	//Set Debug Level and Debug output File if applicable
751 	fDebugLevel = dwLevel;
752 	if (NewOutFile) fOutFile = NewOutFile;
// Set a named runtime option (string key -> integer value). Unknown names
// are logged as errors (presumably returning nonzero; the returns are not
// visible in this excerpt). If raising the thread counts exceeds the pool
// created at init time, additional slave threads are started.
755 int AliHLTTPCCAGPUTrackerNVCC::SetGPUTrackerOption(char* OptionName, int OptionValue)
757 	//Set a specific GPU Tracker Option
758 	if (strcmp(OptionName, "PPMode") == 0)
760 		fPPMode = OptionValue;
762 	else if (strcmp(OptionName, "DebugMask") == 0)
764 		fDebugMask = OptionValue;
766 	else if (strcmp(OptionName, "HelperThreads") == 0)
768 		fNHelperThreads = OptionValue;
770 	else if (strcmp(OptionName, "CPUTrackers") == 0)
772 		fNCPUTrackers = OptionValue;
774 	else if (strcmp(OptionName, "SlicesPerCPUTracker") == 0)
776 		fNSlicesPerCPUTracker = OptionValue;
778 	else if (strcmp(OptionName, "GlobalTracking") == 0)
780 		fGlobalTracking = OptionValue;
784 		HLTError("Unknown Option: %s", OptionName);
// Grow the helper-thread pool if the new settings require more threads than
// were created during InitGPU.
788 	if (fNHelperThreads + fNCPUTrackers > fNSlaveThreads && fCudaInitialized)
790 		HLTInfo("Insufficient Slave Threads available (%d), creating additional Slave Threads (%d+%d)\n", fNSlaveThreads, fNHelperThreads, fNCPUTrackers);
792 		StartHelperThreads();
// Performance timestamping: in standalone builds, record timer 'i' of slice
// 'iSlice' when debugging is enabled; in the HLT framework build it compiles
// to a no-op.
798 #ifdef HLTCA_STANDALONE
799 void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int iSlice, int i)
801 	//Run Performance Query for timer i of slice iSlice
802 	if (fDebugLevel >= 1)
804 		AliHLTTPCCATracker::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));
808 void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
// Debug helper: copy the row-block scheduling structures back from the GPU
// and dump them to fOutFile; optionally (check==true) verify the expected
// tracklet ordering. Compiled to a no-op for bitwise-compatible output or
// the alternative scheduler.
811 #if defined(BITWISE_COMPATIBLE_DEBUG_OUTPUT) || defined(HLTCA_GPU_ALTERNATIVE_SCHEDULER)
812 void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker*, int, bool) {}
814 void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
816 	//Dump Rowblocks to File
817 	if (fDebugLevel >= 4)
819 		*fOutFile << "RowBlock Tracklets (Slice" << tracker[iSlice].Param().ISlice() << " (" << iSlice << " of reco))";
820 		*fOutFile << " after Tracklet Reconstruction";
821 		*fOutFile << std::endl;
// Host-side staging buffers for the device scheduling arrays.
823 		int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);
824 		int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);
825 		uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * fConstructorBlockCount);
826 		CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
827 		CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
828 		CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * fConstructorBlockCount, cudaMemcpyDeviceToHost));
829 		CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
// 'k' tracks the expected next tracklet id for the FIXED_START check below.
831 		int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
832 		for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
834 			*fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " <<
835 				rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";
836 			for (int j = 0;j < rowBlockPos[i].x;j++)
838 				//Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
839 				*fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
840 #ifdef HLTCA_GPU_SCHED_FIXED_START
841 				if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)
843 					HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);
845 #endif //HLTCA_GPU_SCHED_FIXED_START
847 				if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)
849 					HLTError("Error, -1 Tracklet found");
852 			*fOutFile << std::endl << "Phase 2: ";
853 			for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
855 				*fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
857 			*fOutFile << std::endl;
862 		*fOutFile << "Starting Threads: (Slice" << tracker[iSlice].Param().ISlice() << ", First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;
863 		for (int i = 0;i < fConstructorBlockCount;i++)
865 			*fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;
// NOTE(review): a matching free(rowBlockPos) is not visible in this excerpt —
// verify against the full source that rowBlockPos is not leaked.
870 		free(rowBlockTracklets);
871 		free(blockStartingTracklet);
// Device kernel: initialize the scheduler's row-block structures and the
// per-hit weight array before tracklet construction. Uses grid-stride loops
// over int4-vectorized views (16-byte stores) — launch config only affects
// speed, not correctness. RowBlockTracklets entries are set to -1 ("empty"),
// hit weights to 0. The RowBlockPos init (using i0) is part of the lines not
// visible in this excerpt.
876 __global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)
878 	//Initialize GPU RowBlocks and HitWeights
879 	int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;
880 	const int stride = blockDim.x * gridDim.x;
881 	i0.x = i0.y = i0.z = i0.w = 0;
883 #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
884 	int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;
886 	i1.x = i1.y = i1.z = i1.w = -1;
887 	for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)
889 	for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)
890 		rowBlockTracklets4[i] = i1;
892 	for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)
893 		sliceDataHitWeights4[i] = i0;
// Recovery path after an unrecoverable CUDA error: tear down and reinitialize
// the GPU on the same device, then retry the reconstruction once. A static
// 'selfHealing' flag prevents infinite recursion — a failure during the
// retry gives up instead of healing again. Returns 1 on failure, otherwise
// the retried Reconstruct's return value (final lines not visible in this
// excerpt).
896 int AliHLTTPCCAGPUTrackerNVCC::SelfHealReconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
900 	cuCtxPopCurrent((CUcontext*) fCudaContext);
903 	static bool selfHealing = false;
906 		HLTError("Selfhealing failed, giving up");
907 		cuCtxPopCurrent((CUcontext*) fCudaContext);
912 	HLTError("Unsolvable CUDA error occured, trying to reinitialize GPU");
// Re-init on the same device that was in use before the error.
916 	if (InitGPU(fSliceCount, fCudaDevice))
918 		HLTError("Could not reinitialize CUDA device, disabling GPU tracker");
922 	HLTInfo("GPU tracker successfully reinitialized, restarting tracking");
923 	int retVal = Reconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal);
// Prepare one slice's input: point the slave tracker's slice-data buffers at
// the pinned host memory for this slice, then unpack the cluster data into
// them. Optionally records wall-clock timings when HLTCA_GPU_TIME_PROFILE is
// enabled ('threadId' is only used for that profiling printout).
928 void AliHLTTPCCAGPUTrackerNVCC::ReadEvent(AliHLTTPCCAClusterData* pClusterData, int firstSlice, int iSlice, int threadId)
930 	fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
931 #ifdef HLTCA_GPU_TIME_PROFILE
932 	unsigned long long int a, b;
933 	AliHLTTPCCATracker::StandaloneQueryTime(&a);
935 	fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
936 #ifdef HLTCA_GPU_TIME_PROFILE
937 	AliHLTTPCCATracker::StandaloneQueryTime(&b);
938 	printf("Read %d %f %f\n", threadId, ((double) b - (double) a) / (double) fProfTimeC, ((double) a - (double) fProfTimeD) / (double) fProfTimeC);
// Produce the final slice output for one slice. WriteOutputPrepare allocates
// from shared output memory and is therefore serialized via fHelperMemMutex
// whenever helper threads run concurrently; the actual WriteOutput runs
// unlocked. Optional HLTCA_GPU_TIME_PROFILE timing as in ReadEvent.
942 void AliHLTTPCCAGPUTrackerNVCC::WriteOutput(AliHLTTPCCASliceOutput** pOutput, int firstSlice, int iSlice, int threadId)
944 	if (fDebugLevel >= 3) printf("GPU Tracker running WriteOutput for slice %d on thread %d\n", firstSlice + iSlice, threadId);
945 	fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
946 #ifdef HLTCA_GPU_TIME_PROFILE
947 	unsigned long long int a, b;
948 	AliHLTTPCCATracker::StandaloneQueryTime(&a);
950 	if (fNHelperThreads) pthread_mutex_lock((pthread_mutex_t*) fHelperMemMutex);
951 	fSlaveTrackers[firstSlice + iSlice].WriteOutputPrepare();
952 	if (fNHelperThreads) pthread_mutex_unlock((pthread_mutex_t*) fHelperMemMutex);
953 	fSlaveTrackers[firstSlice + iSlice].WriteOutput();
954 #ifdef HLTCA_GPU_TIME_PROFILE
955 	AliHLTTPCCATracker::StandaloneQueryTime(&b);
956 	printf("Write %d %f %f\n", threadId, ((double) b - (double) a) / (double) fProfTimeC, ((double) a - (double) fProfTimeD) / (double) fProfTimeC);
958 	if (fDebugLevel >= 3) printf("GPU Tracker finished WriteOutput for slice %d on thread %d\n", firstSlice + iSlice, threadId);
// Primary GPU reconstruction driver. For sliceCountLocal slices starting at
// firstSlice it: validates state, optionally delegates to ReconstructPP,
// dispatches CPU-tracker helper threads, lays out per-slice GPU memory, binds
// textures, uploads the tracker objects to constant memory, runs the kernel
// chain (neighbours finder -> neighbours cleaner -> start-hits finder/sorter ->
// tracklet constructor -> tracklet selector), asynchronously copies results
// back while interleaving optional global tracking and output writing, and
// finally joins the helper threads.
// Error-path idiom throughout: cudaThreadSynchronize + cuCtxPopCurrent +
// ResetHelperThreads, then (elided in this excerpt) an error return.
// RANDOM_ERROR appears in error-check conditions — presumably an empty or
// fault-injection macro defined earlier in the file; TODO confirm.
// NOTE: many structural lines (braces, #else/#endif, return statements) are
// elided in this numbered excerpt; edit only against the full file.
961 int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
963 //Primary reconstruction function
965 cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
// --- Sanity checks: default slice count, initialization, capacity, thread affinity ---
967 if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;
969 if (!fCudaInitialized)
971 HLTError("GPUTracker not initialized");
974 if (sliceCountLocal > fSliceCount)
976 HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);
979 if (fThreadId != GetThread())
981 HLTWarning("CUDA thread changed, migrating context, Previous Thread: %d, New Thread: %d", fThreadId, GetThread());
982 fThreadId = GetThread();
985 if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);
// All tracker objects must fit in the __constant__ array gAliHLTTPCCATracker.
987 if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
989 HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));
993 cuCtxPushCurrent(*((CUcontext*) fCudaContext)); //make our driver-API context current for this thread
// PP (small-event) fast path — condition line elided in this excerpt.
996 int retVal = ReconstructPP(pOutput, pClusterData, firstSlice, sliceCountLocal);
997 cuCtxPopCurrent((CUcontext*) fCudaContext);
// --- Hand the trailing slices to dedicated CPU tracker threads (phase flag CPUTracker = 1) ---
1001 for (int i = fNHelperThreads;i < fNCPUTrackers + fNHelperThreads;i++)
1003 fHelperParams[i].CPUTracker = 1;
1004 fHelperParams[i].pClusterData = pClusterData;
1005 fHelperParams[i].pOutput = pOutput;
1006 fHelperParams[i].fSliceCount = sliceCountLocal;
1007 fHelperParams[i].fFirstSlice = firstSlice;
1008 pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]); //mutex[0] releases the worker to start
1010 sliceCountLocal -= fNCPUTrackers * fNSlicesPerCPUTracker; //GPU handles only the remaining slices
1011 if (sliceCountLocal < 0) sliceCountLocal = 0;
1013 fUseGlobalTracking = fGlobalTracking && sliceCountLocal == fgkNSlices; //global tracking needs all slices on the GPU
1015 memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal); //GPU-side tracker objects start as copies of the host trackers
1017 if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
1019 #ifdef HLTCA_GPU_TIME_PROFILE
1020 AliHLTTPCCATracker::StandaloneQueryFreq(&fProfTimeC);
1021 AliHLTTPCCATracker::StandaloneQueryTime(&fProfTimeD);
// --- Per-slice GPU memory layout: carve hits/tracklets/tracks regions out of global memory ---
1024 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1026 //Make this a GPU Tracker
1027 fGpuTracker[iSlice].SetGPUTracker();
1028 fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
1029 fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
1030 fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
1032 //Set Pointers to GPU Memory
1033 char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
1035 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
1036 tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
1037 tmpMem = alignPointer(tmpMem, 1024 * 1024); //1 MiB alignment between regions
1039 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
1040 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount);
1041 tmpMem = alignPointer(tmpMem, 1024 * 1024);
1043 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
1044 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
1045 tmpMem = alignPointer(tmpMem, 1024 * 1024);
1047 if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY RANDOM_ERROR)
1049 HLTError("Insufficiant Track Memory");
1050 cudaThreadSynchronize();
1051 cuCtxPopCurrent((CUcontext*) fCudaContext);
1052 ResetHelperThreads(0);
1056 if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY RANDOM_ERROR)
1058 HLTError("Insufficiant Global Memory");
1059 cudaThreadSynchronize();
1060 cuCtxPopCurrent((CUcontext*) fCudaContext);
1061 ResetHelperThreads(0);
1065 if (fDebugLevel >= 3)
1067 HLTInfo("GPU Global Memory Used: %d/%d, Page Locked Tracks Memory used: %d / %d", (int) (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice)), HLTCA_GPU_GLOBAL_MEMORY, (int) fGpuTracker[iSlice].TrackMemorySize(), HLTCA_GPU_TRACKS_MEMORY);
1070 //Initialize Startup Constants
1071 *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
1072 *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
1073 *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
// Distribute constructor blocks over slices: one (or zero) block each when
// slices outnumber blocks, otherwise a proportional contiguous share.
1074 fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = sliceCountLocal > fConstructorBlockCount ? (iSlice < fConstructorBlockCount) : fConstructorBlockCount * (iSlice + 1) / sliceCountLocal - fConstructorBlockCount * (iSlice) / sliceCountLocal;
1075 if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);
1076 fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
1077 fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
1078 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
1079 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fNextTracklet = (fConstructorBlockCount / sliceCountLocal + (fConstructorBlockCount % sliceCountLocal > iSlice)) * HLTCA_GPU_THREAD_COUNT;
1080 fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory()); //all slices share one texture base (slice 0's data memory)
// --- Bind the slice-data region to the three texture references (hit fetch path) ---
// A nonzero bind offset is treated as an error: kernels assume offset 0.
1083 #ifdef HLTCA_GPU_TEXTURE_FETCH
1084 cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
1086 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
1088 HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
1089 cudaThreadSynchronize();
1090 cuCtxPopCurrent((CUcontext*) fCudaContext);
1091 ResetHelperThreads(0);
1094 cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
1095 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
1097 HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
1098 cudaThreadSynchronize();
1099 cuCtxPopCurrent((CUcontext*) fCudaContext);
1100 ResetHelperThreads(0);
1103 cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
1104 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
1106 HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
1107 cudaThreadSynchronize();
1108 cuCtxPopCurrent((CUcontext*) fCudaContext);
1109 ResetHelperThreads(0);
1114 //Copy Tracker Object to GPU Memory
1115 if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
1116 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
1118 if (CudaFailedMsg(cudaMalloc(&tmpMem, 100000000))) //~100 MB device buffer for per-sync stage profiling
1120 HLTError("Error allocating CUDA profile memory");
1121 cudaThreadSynchronize();
1122 cuCtxPopCurrent((CUcontext*) fCudaContext);
1123 ResetHelperThreads(0);
1126 fGpuTracker[0].fStageAtSync = tmpMem;
1127 CudaFailedMsg(cudaMemset(fGpuTracker[0].StageAtSync(), 0, 100000000));
// Upload all per-slice tracker objects into __constant__ memory on stream 0.
1129 CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));
1130 if (CUDASync("Initialization (1)", 0, firstSlice) RANDOM_ERROR)
1132 cudaThreadSynchronize();
1133 cuCtxPopCurrent((CUcontext*) fCudaContext);
1134 ResetHelperThreads(0);
// --- Release helper threads in phase 0 (parallel ReadEvent of the slices they own) ---
1138 for (int i = 0;i < fNHelperThreads;i++)
1140 fHelperParams[i].CPUTracker = 0;
1141 fHelperParams[i].fDone = 0;
1142 fHelperParams[i].fPhase = 0;
1143 fHelperParams[i].pClusterData = pClusterData;
1144 fHelperParams[i].fSliceCount = sliceCountLocal;
1145 fHelperParams[i].fFirstSlice = firstSlice;
1146 pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]);
// --- Main per-slice pipeline: read event, upload, run the initial kernel chain ---
// Slices are round-robined over (main thread + helpers); work alternates
// between streams 0 and 1 (iSlice & 1) to overlap copy and compute.
1149 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1151 StandalonePerfTime(firstSlice + iSlice, 0);
1153 //Initialize GPU Slave Tracker
1154 if (fDebugLevel >= 3) HLTInfo("Creating Slice Data (Slice %d)", iSlice);
1155 if (iSlice % (fNHelperThreads + 1) == 0)
1157 ReadEvent(pClusterData, firstSlice, iSlice, 0); //this slice belongs to the main thread
1161 if (fDebugLevel >= 3) HLTInfo("Waiting for helper thread %d", iSlice % (fNHelperThreads + 1) - 1);
1162 while(fHelperParams[iSlice % (fNHelperThreads + 1) - 1].fDone < iSlice); //spin-wait until the owning helper finished this slice
1165 if (fDebugLevel >= 4)
1167 #ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
1168 *fOutFile << std::endl << std::endl << "Reconstruction: " << iSlice << "/" << sliceCountLocal << " Total Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << " / " << fgkNSlices << std::endl;
1170 if (fDebugMask & 1) fSlaveTrackers[firstSlice + iSlice].DumpSliceData(*fOutFile);
1173 if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY RANDOM_ERROR)
1175 HLTError("Insufficiant Slice Data Memory");
1176 cudaThreadSynchronize();
1177 cuCtxPopCurrent((CUcontext*) fCudaContext);
1178 ResetHelperThreads(1);
1182 if (fDebugLevel >= 3)
1184 HLTInfo("GPU Slice Data Memory Used: %d/%d", (int) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize(), HLTCA_GPU_SLICE_DATA_MEMORY);
1187 //Initialize temporary memory where needed
1188 if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");
1189 PreInitRowBlocks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign()); //clears scheduler row blocks and hit weights on stream 2
1190 if (CUDASync("Initialization (2)", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1192 cudaThreadSynchronize();
1193 cuCtxPopCurrent((CUcontext*) fCudaContext);
1194 ResetHelperThreads(1);
1198 //Copy Data to GPU Global Memory
1199 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
1200 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
1201 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
1203 if (fDebugLevel >= 4)
1205 if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
// Debug-only host shadow buffers (freed near the end of the slice loop below).
1206 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount);
1207 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() );
1210 if (CUDASync("Initialization (3)", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1212 cudaThreadSynchronize();
1213 cuCtxPopCurrent((CUcontext*) fCudaContext);
1214 ResetHelperThreads(1);
1217 StandalonePerfTime(firstSlice + iSlice, 1);
// Kernel 1: neighbours finder — one block per row.
1219 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder (Slice %d/%d)", iSlice, sliceCountLocal);
1220 AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), HLTCA_GPU_THREAD_COUNT_FINDER, 0, cudaStreams[iSlice & 1]>>>(iSlice);
1222 if (CUDASync("Neighbours finder", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1224 cudaThreadSynchronize();
1225 cuCtxPopCurrent((CUcontext*) fCudaContext);
1226 ResetHelperThreads(1);
1230 StandalonePerfTime(firstSlice + iSlice, 2);
1232 if (fDebugLevel >= 4)
1234 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
1235 if (fDebugMask & 2) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
// Kernel 2: neighbours cleaner — skips the two edge rows (NRows()-2 blocks).
1238 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner (Slice %d/%d)", iSlice, sliceCountLocal);
1239 AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice);
1240 if (CUDASync("Neighbours Cleaner", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1242 cudaThreadSynchronize();
1243 cuCtxPopCurrent((CUcontext*) fCudaContext);
1244 ResetHelperThreads(1);
1248 StandalonePerfTime(firstSlice + iSlice, 3);
1250 if (fDebugLevel >= 4)
1252 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
1253 if (fDebugMask & 4) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
// Kernel 3: start hits finder (NRows()-6 blocks), then kernel 4: sorter.
1256 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder (Slice %d/%d)", iSlice, sliceCountLocal);
1257 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice);
1258 if (CUDASync("Start Hits Finder", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1260 cudaThreadSynchronize();
1261 cuCtxPopCurrent((CUcontext*) fCudaContext);
1262 ResetHelperThreads(1);
1266 StandalonePerfTime(firstSlice + iSlice, 4);
1268 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter (Slice %d/%d)", iSlice, sliceCountLocal);
1269 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice);
1270 if (CUDASync("Start Hits Sorter", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1272 cudaThreadSynchronize();
1273 cuCtxPopCurrent((CUcontext*) fCudaContext);
1274 ResetHelperThreads(1);
1278 StandalonePerfTime(firstSlice + iSlice, 5);
1280 if (fDebugLevel >= 2)
1282 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
1283 if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d (Slice %d)", *fSlaveTrackers[firstSlice + iSlice].NTracklets(), iSlice);
1284 if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS RANDOM_ERROR)
1286 HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant");
1287 cudaThreadSynchronize();
1288 cuCtxPopCurrent((CUcontext*) fCudaContext);
1289 ResetHelperThreads(1);
1294 if (fDebugLevel >= 4 && *fSlaveTrackers[firstSlice + iSlice].NTracklets())
1296 #ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
1297 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
1300 *fOutFile << "Temporary ";
1301 fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
1303 uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
1304 CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
1305 if (fDebugMask & 16)
1307 *fOutFile << "Start Hits Sort Vector:" << std::endl;
1308 for (int i = 1;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows() - 5;i++)
1310 *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;
1316 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
1317 if (fDebugMask & 32) fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
1320 StandalonePerfTime(firstSlice + iSlice, 6);
1322 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters()); //results land in page-locked host memory
// --- Join helper threads' phase 0 (mutex[1] is their completion gate) ---
1325 for (int i = 0;i < fNHelperThreads;i++)
1327 pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]);
1330 StandalonePerfTime(firstSlice, 7);
// --- Tracklet construction; restart target for recoverable scheduler errors ---
1332 #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
1333 int nHardCollisions = 0;
1335 RestartTrackletConstructor:
1336 if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
1337 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1339 AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);
1340 if (CUDASync("Tracklet Initializer", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1342 cudaThreadSynchronize();
1343 cuCtxPopCurrent((CUcontext*) fCudaContext);
1346 if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice);
1350 if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
1351 AliHLTTPCCATrackletConstructorGPU<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR>>>(); //one launch processes all slices (slice data in constant memory)
1352 if (CUDASync("Tracklet Constructor", 0, firstSlice) RANDOM_ERROR)
1354 cudaThreadSynchronize();
1355 cuCtxPopCurrent((CUcontext*) fCudaContext);
1359 StandalonePerfTime(firstSlice, 8);
1361 if (fDebugLevel >= 4)
1363 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1365 if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);
1366 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
1367 if (fDebugLevel >= 5)
1369 HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
1371 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
1372 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
1373 if (0 && fSlaveTrackers[firstSlice + iSlice].NTracklets() && fSlaveTrackers[firstSlice + iSlice].Tracklet(0).NHits() < 0) //dead branch (0 &&) — kept as a debugging stub
1375 cudaThreadSynchronize();
1376 cuCtxPopCurrent((CUcontext*) fCudaContext);
1377 printf("INTERNAL ERROR\n");
1380 if (fDebugMask & 128) fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);
// --- Tracklet selector: batches of up to HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT slices per launch ---
1385 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)
1387 if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;
1388 if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);
1389 AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<selectorBlockCount, HLTCA_GPU_THREAD_COUNT_SELECTOR, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));
1390 if (CUDASync("Tracklet Selector", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1392 cudaThreadSynchronize();
1393 cuCtxPopCurrent((CUcontext*) fCudaContext);
1397 StandalonePerfTime(firstSlice, 9);
// --- Optional global (cross-slice) tracking setup ---
1399 char *tmpMemoryGlobalTracking = NULL;
1400 fSliceOutputReady = 0;
1401 if (fUseGlobalTracking)
1403 int tmpmemSize = sizeof(AliHLTTPCCATracklet)
1404 #ifdef EXTERN_ROW_HITS
1405 + HLTCA_ROW_COUNT * sizeof(int)
1408 tmpMemoryGlobalTracking = (char*) malloc(tmpmemSize * fgkNSlices); //one scratch tracklet per slice; freed after the output phase
1409 for (int i = 0;i < fgkNSlices;i++)
1411 fSliceLeftGlobalReady[i] = 0;
1412 fSliceRightGlobalReady[i] = 0;
1414 memset(fGlobalTrackingDone, 0, fgkNSlices);
1415 memset(fWriteOutputDone, 0, fgkNSlices);
1417 for (int iSlice = 0;iSlice < fgkNSlices;iSlice++)
1419 fSlaveTrackers[iSlice].SetGPUTrackerTrackletsMemory(tmpMemoryGlobalTracking + (tmpmemSize * iSlice), 1, fConstructorBlockCount);
// --- Release helper threads in phase 1 (output writing) ---
1422 for (int i = 0;i < fNHelperThreads;i++)
1424 fHelperParams[i].fPhase = 1;
1425 fHelperParams[i].pOutput = pOutput;
1426 pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]);
// --- Result readback: eagerly queue D2H copies on whichever streams are idle ---
// tmpSlice tracks common-memory copies issued; tmpSlice2 tracks the dependent
// track/hit copies (sized by the NTracks/NTrackHits values just received).
1429 int tmpSlice = 0, tmpSlice2 = 0;
1430 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1432 if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");
1434 while(tmpSlice < sliceCountLocal && (tmpSlice == iSlice || cudaStreamQuery(cudaStreams[tmpSlice]) == (cudaError_t) CUDA_SUCCESS))
1436 if (CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice])) RANDOM_ERROR)
1438 ResetHelperThreads(1);
1439 cudaThreadSynchronize();
1440 return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal)); //attempt device reinit + full retry
1445 while (tmpSlice2 < tmpSlice && (tmpSlice2 == iSlice ? cudaStreamSynchronize(cudaStreams[tmpSlice2]) : cudaStreamQuery(cudaStreams[tmpSlice2])) == (cudaError_t) CUDA_SUCCESS)
1447 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].Tracks(), fGpuTracker[tmpSlice2].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + tmpSlice2].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));
1448 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].TrackHits(), fGpuTracker[tmpSlice2].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + tmpSlice2].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));
1452 if (CudaFailedMsg(cudaStreamSynchronize(cudaStreams[iSlice])) RANDOM_ERROR)
1454 ResetHelperThreads(1);
1455 cudaThreadSynchronize();
1456 return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal));
1459 if (fDebugLevel >= 4)
1461 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
1462 #ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
1463 if (fDebugMask & 256) fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
1465 if (fDebugMask & 512) fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);
// --- GPU-side error handling: recoverable scheduler collisions restart the constructor (up to 10 times) ---
1468 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError RANDOM_ERROR)
1470 #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
1471 if ((fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION || fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_WRONG_ROW)&& nHardCollisions++ < 10)
1473 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION)
1475 HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor (Slice %d)", firstSlice + iSlice);
1479 HLTWarning("Tracklet Constructor returned invalid row (Slice %d)", firstSlice + iSlice);
1481 if (fDebugLevel >= 4)
1483 ResetHelperThreads(1);
1484 cudaThreadSynchronize();
1485 cuCtxPopCurrent((CUcontext*) fCudaContext);
// Reset all slices' counters on host and device, re-clear scheduler state, retry.
1488 for (int i = 0;i < sliceCountLocal;i++)
1490 cudaThreadSynchronize();
1491 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));
1492 *fSlaveTrackers[firstSlice + i].NTracks() = 0;
1493 *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;
1494 fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;
1495 CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));
1496 PreInitRowBlocks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());
1498 goto RestartTrackletConstructor;
1501 HLTError("GPU Tracker returned Error Code %d in slice %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError, firstSlice + iSlice); //non-recoverable GPU error
1502 cudaThreadSynchronize();
1503 cuCtxPopCurrent((CUcontext*) fCudaContext);
1504 ResetHelperThreads(1);
1507 if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
// Snapshot local (pre-global-tracking) track counts before global tracking appends.
1509 fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTracks = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracks;
1510 fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTrackHits = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTrackHits;
1511 if (fUseGlobalTracking) fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracklets = 1;
1513 if (fDebugLevel >= 3) HLTInfo("Data ready for slice %d, helper thread %d", iSlice, iSlice % (fNHelperThreads + 1));
1514 fSliceOutputReady = iSlice; //publish progress to helper threads
// --- Interleave global tracking and output writing for slices whose neighbours are ready ---
1516 if (fUseGlobalTracking)
1518 if (iSlice % (fgkNSlices / 2) == 2)
1520 int tmpId = iSlice % (fgkNSlices / 2) - 1;
1521 if (iSlice >= fgkNSlices / 2) tmpId += fgkNSlices / 2; //keep the id on the same detector side
1522 GlobalTracking(tmpId, 0, NULL);
1523 fGlobalTrackingDone[tmpId] = 1;
1525 for (int tmpSlice3a = 0;tmpSlice3a < iSlice;tmpSlice3a += fNHelperThreads + 1)
1527 int tmpSlice3 = tmpSlice3a + 1;
1528 if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2);
1529 if (tmpSlice3 >= iSlice) break;
1531 int sliceLeft = (tmpSlice3 + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2); //neighbouring slices within one side (half of fgkNSlices)
1532 int sliceRight = (tmpSlice3 + 1) % (fgkNSlices / 2);
1533 if (tmpSlice3 >= fgkNSlices / 2)
1535 sliceLeft += fgkNSlices / 2;
1536 sliceRight += fgkNSlices / 2;
1539 if (tmpSlice3 % (fgkNSlices / 2) != 1 && fGlobalTrackingDone[tmpSlice3] == 0 && sliceLeft < iSlice && sliceRight < iSlice)
1541 GlobalTracking(tmpSlice3, 0, NULL);
1542 fGlobalTrackingDone[tmpSlice3] = 1;
1545 if (fWriteOutputDone[tmpSlice3] == 0 && fSliceLeftGlobalReady[tmpSlice3] && fSliceRightGlobalReady[tmpSlice3])
1547 WriteOutput(pOutput, firstSlice, tmpSlice3, 0);
1548 fWriteOutputDone[tmpSlice3] = 1;
1554 if (iSlice % (fNHelperThreads + 1) == 0)
1556 WriteOutput(pOutput, firstSlice, iSlice, 0); //non-global path: main thread writes its own slices directly
1560 if (fDebugLevel >= 4)
1562 delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory(); //free the debug shadow buffers allocated above
1563 delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();
// --- Drain remaining global tracking and output work, then join all workers ---
1567 if (fUseGlobalTracking)
1569 for (int tmpSlice3a = 0;tmpSlice3a < fgkNSlices;tmpSlice3a += fNHelperThreads + 1)
1571 int tmpSlice3 = (tmpSlice3a + 1);
1572 if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2);
1573 if (fGlobalTrackingDone[tmpSlice3] == 0) GlobalTracking(tmpSlice3, 0, NULL);
1575 for (int tmpSlice3a = 0;tmpSlice3a < fgkNSlices;tmpSlice3a += fNHelperThreads + 1)
1577 int tmpSlice3 = (tmpSlice3a + 1);
1578 if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2);
1579 if (fWriteOutputDone[tmpSlice3] == 0)
1581 while (fSliceLeftGlobalReady[tmpSlice3] == 0 || fSliceRightGlobalReady[tmpSlice3] == 0); //spin-wait on helper-thread flags
1582 WriteOutput(pOutput, firstSlice, tmpSlice3, 0);
1587 for (int i = 0;i < fNHelperThreads + fNCPUTrackers;i++)
1589 pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]); //wait for every worker's completion gate
1592 if (fUseGlobalTracking)
1594 free(tmpMemoryGlobalTracking);
1595 if (fDebugLevel >= 3)
1597 for (int iSlice = 0;iSlice < fgkNSlices;iSlice++)
1599 printf("Slice %d - Tracks: Local %d Global %d - Hits: Local %d Global %d\n", iSlice, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTracks, fSlaveTrackers[iSlice].CommonMemory()->fNTracks, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTrackHits, fSlaveTrackers[iSlice].CommonMemory()->fNTrackHits);
1604 StandalonePerfTime(firstSlice, 10);
1606 if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
1608 /*for (int i = firstSlice;i < firstSlice + sliceCountLocal;i++)
1610 fSlaveTrackers[i].DumpOutput(stdout);
1613 /*static int runnum = 0;
1614 std::ofstream tmpOut;
1616 sprintf(buffer, "GPUtracks%d.out", runnum++);
1617 tmpOut.open(buffer);
1618 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1620 fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(tmpOut);
// --- Profiling postprocessing: dump per-sync pipeline stages as text + BMP visualization ---
1624 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
1625 char* stageAtSync = (char*) malloc(100000000);
1626 CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].StageAtSync(), 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
1627 cudaFree(fGpuTracker[0].StageAtSync());
1629 FILE* fp = fopen("profile.txt", "w+");
1630 FILE* fp2 = fopen("profile.bmp", "w+b");
1631 int nEmptySync = 0, fEmpty;
1633 const int bmpheight = 8192;
1634 BITMAPFILEHEADER bmpFH;
1635 BITMAPINFOHEADER bmpIH;
1636 ZeroMemory(&bmpFH, sizeof(bmpFH));
1637 ZeroMemory(&bmpIH, sizeof(bmpIH));
1639 bmpFH.bfType = 19778; //"BM"
1640 bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR / 32 * 33 - 1) * bmpheight ;
1641 bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);
1643 bmpIH.biSize = sizeof(bmpIH);
1644 bmpIH.biWidth = fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR / 32 * 33 - 1; //33rd column per warp is a white separator
1645 bmpIH.biHeight = bmpheight;
1647 bmpIH.biBitCount = 32;
1649 fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
1650 fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);
1652 for (int i = 0;i < bmpheight * fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;i += fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR)
1655 for (int j = 0;j < fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;j++)
1657 fprintf(fp, "%d\t", stageAtSync[i + j]);
1659 if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0); //stage -> pixel color mapping
1660 if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);
1661 if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);
1662 if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);
1663 fwrite(&color, 1, sizeof(int), fp2);
1664 if (j > 0 && j % 32 == 0)
1666 color = RGB(255, 255, 255); //warp boundary separator column
1667 fwrite(&color, 1, 4, fp2);
1669 if (stageAtSync[i + j]) fEmpty = 0;
1672 if (fEmpty) nEmptySync++;
1673 else nEmptySync = 0;
1674 //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;
1682 cuCtxPopCurrent((CUcontext*) fCudaContext); //release the context before returning (success return elided in this excerpt)
// GPU kernel (PP path): zeroes the hit-weight array of every slice's tracker.
// Trackers are read from the __constant__ array gAliHLTTPCCATracker. Writes use
// int4 vector stores in a grid-stride loop (stride = blockDim.x * gridDim.x),
// so any launch configuration covers the full array. The declaration of i0
// (presumably an int4 local, original line ~1696) is elided in this excerpt.
// dwCount is in ints; the loop bound converts it to int4 elements — assumes
// NumberOfHitsPlusAlign() is int4-aligned; TODO confirm against the allocator.
1686 __global__ void ClearPPHitWeights(int sliceCount)
1690 for (int k = 0;k < sliceCount;k++)
1692 AliHLTTPCCATracker &tracker = ((AliHLTTPCCATracker*) gAliHLTTPCCATracker)[k];
1693 int4* const pHitWeights = (int4*) tracker.Data().HitWeights();
1694 const int dwCount = tracker.Data().NumberOfHitsPlusAlign();
1695 const int stride = blockDim.x * gridDim.x;
1697 i0.x = i0.y = i0.z = i0.w = 0;
1699 for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < dwCount * sizeof(int) / sizeof(int4);i += stride)
1701 pHitWeights[i] = i0;
// ReconstructPP: primary GPU reconstruction path for small (pp) events.
// Prepares slice data on the host, lays out per-slice GPU memory regions,
// uploads everything in one batch, runs the tracking kernel chain
// (neighbours finder -> neighbours cleaner -> start hits finder ->
// tracklet constructor -> tracklet selector), copies tracks back, then
// optionally performs CPU-side global tracking across slice borders and
// writes the per-slice output. Returns 1 on any CUDA or tracker error.
// NOTE(review): this excerpt omits interleaved lines (braces, returns,
// declarations, #endif); comments describe only the visible statements.
1706 int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
1708 //Primary reconstruction function for small events (PP)
// Seed the GPU tracker objects from the host-side slave trackers.
1710 memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
1712 if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
// Slice data for all slices is carved out of one contiguous host buffer and
// one contiguous GPU buffer; both rolling pointers advance in lockstep.
1714 char* tmpSliceMemHost = (char*) SliceDataMemory(fHostLockedMemory, 0);
1715 char* tmpSliceMemGpu = (char*) SliceDataMemory(fGPUMemory, 0);
1717 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1719 StandalonePerfTime(firstSlice + iSlice, 0);
1721 //Initialize GPU Slave Tracker
1722 if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
1723 fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(tmpSliceMemHost, RowMemory(fHostLockedMemory, firstSlice + iSlice));
1724 fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
// Guard against overrunning the fixed per-slice data budget.
1725 if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
1727 HLTError("Insufficiant Slice Data Memory");
1731 //Make this a GPU Tracker
1732 fGpuTracker[iSlice].SetGPUTracker();
1733 fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
1736 fGpuTracker[iSlice].SetGPUSliceDataMemory(tmpSliceMemGpu, RowMemory(fGPUMemory, iSlice));
1737 fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
// Advance both rolling pointers by this slice's size, 64 kB aligned.
1739 tmpSliceMemHost += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();
1740 tmpSliceMemHost = alignPointer(tmpSliceMemHost, 64 * 1024);
1741 tmpSliceMemGpu += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();
1742 tmpSliceMemGpu = alignPointer(tmpSliceMemGpu, 64 * 1024);
1745 //Set Pointers to GPU Memory
// Hits, tracklets and tracks regions are laid out sequentially inside the
// per-slice global-memory region, each 64 kB aligned.
1746 char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
1748 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
1749 tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
1750 tmpMem = alignPointer(tmpMem, 64 * 1024);
1752 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
1753 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount);
1754 tmpMem = alignPointer(tmpMem, 64 * 1024);
1756 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
1757 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
1758 tmpMem = alignPointer(tmpMem, 64 * 1024);
// Budget checks for the track and global memory regions.
1760 if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
1762 HLTError("Insufficiant Track Memory");
1766 if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
1768 HLTError("Insufficiant Global Memory");
1772 //Initialize Startup Constants
1773 *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
1774 *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
1775 *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
1776 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
// All slices share the texture base of slice 0's data memory.
1778 fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
1780 if (CUDASync("Initialization", iSlice, iSlice + firstSlice)) return(1);
1781 StandalonePerfTime(firstSlice + iSlice, 1);
// Optional texture binding for cached slice-data fetches. A nonzero bind
// offset would break the texture indexing, so it is treated as an error.
1784 #ifdef HLTCA_GPU_TEXTURE_FETCH
1785 cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
1787 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
1789 HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
1792 cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
1793 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
1795 HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
1798 cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
1799 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
1801 HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
1806 //Copy Tracker Object to GPU Memory
1807 if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
// Tracker objects go to __constant__ memory (gAliHLTTPCCATracker symbol).
1808 CudaFailedMsg(cudaMemcpyToSymbol(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice));
1810 //Copy Data to GPU Global Memory
1811 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1813 CudaFailedMsg(cudaMemcpy(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice));
1814 //printf("%lld %lld %d %d\n", (size_t) (char*) fGpuTracker[iSlice].Data().Memory(), (size_t) (char*) fSlaveTrackers[firstSlice + iSlice].Data().Memory(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize());
1816 //CudaFailedMsg(cudaMemcpy(SliceDataMemory(fGPUMemory, 0), SliceDataMemory(fHostLockedMemory, 0), tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0), cudaMemcpyHostToDevice));
1817 //printf("%lld %lld %d\n", (size_t) (char*) SliceDataMemory(fGPUMemory, 0), (size_t) (char*) SliceDataMemory(fHostLockedMemory, 0), (int) (size_t) (tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0)));
// Common memory and row structures of ALL slices are transferred with a
// single copy each, exploiting their contiguous layout.
1818 CudaFailedMsg(cudaMemcpy(fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyHostToDevice));
1819 CudaFailedMsg(cudaMemcpy(fGpuTracker[0].SliceDataRows(), fSlaveTrackers[firstSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) * sliceCountLocal, cudaMemcpyHostToDevice));
// --- GPU kernel chain: each stage is synchronized and perf-timed. ---
1821 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");
1822 AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursFinder> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_FINDER>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows());
1823 if (CUDASync("Neighbours finder", 0, firstSlice)) return 1;
1824 StandalonePerfTime(firstSlice, 2);
1825 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");
1826 AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursCleaner> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 2);
1827 if (CUDASync("Neighbours Cleaner", 0, firstSlice)) return 1;
1828 StandalonePerfTime(firstSlice, 3);
1829 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");
1830 AliHLTTPCCAProcessMultiA<AliHLTTPCCAStartHitsFinder> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 6);
1831 if (CUDASync("Start Hits Finder", 0, firstSlice)) return 1;
1832 StandalonePerfTime(firstSlice, 4);
// Hit weights must be zeroed before the tracklet constructor uses them.
1834 ClearPPHitWeights <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(sliceCountLocal);
1835 if (CUDASync("Clear Hit Weights", 0, firstSlice)) return 1;
// Point the host-side trackers at the pinned host track buffers for the copy-back.
1837 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1839 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
1842 StandalonePerfTime(firstSlice, 7);
1844 if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
1845 AliHLTTPCCATrackletConstructorGPUPP<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR>>>(0, sliceCountLocal);
1846 if (CUDASync("Tracklet Constructor PP", 0, firstSlice)) return 1;
1848 StandalonePerfTime(firstSlice, 8);
// NOTE(review): selectorBlockCount has no visible declaration in this
// excerpt (elsewhere the member is fSelectorBlockCount) — confirm.
1850 AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<selectorBlockCount, HLTCA_GPU_THREAD_COUNT_SELECTOR>>>(0, sliceCountLocal);
1851 if (CUDASync("Tracklet Selector", 0, firstSlice)) return 1;
1852 StandalonePerfTime(firstSlice, 9);
// Retrieve the common memory (track/hit counters, error flags) of all slices.
1854 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice].CommonMemory(), fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyDeviceToHost));
1856 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1858 if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");
// Copy only as many tracks/hits as the GPU actually produced.
1860 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost));
1861 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost));
1863 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
1865 HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
1868 if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
// Snapshot the local (pre-global-tracking) counters.
1869 fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTracks = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracks;
1870 fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTrackHits = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTrackHits;
// --- Optional CPU global tracking across neighbouring slices; only runs
// when all slices were processed in this pass. ---
1873 if (fGlobalTracking && sliceCountLocal == fgkNSlices)
// Dummy tracklet buffer so PerformGlobalTracking has valid pointers;
// sized for one tracklet (plus per-row hits if EXTERN_ROW_HITS is set).
1875 char tmpMemory[sizeof(AliHLTTPCCATracklet)
1876 #ifdef EXTERN_ROW_HITS
1877 + HLTCA_ROW_COUNT * sizeof(int)
1881 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1883 if (fSlaveTrackers[iSlice].CommonMemory()->fNTracklets)
1885 HLTError("Slave tracker tracklets found where none expected, memory not freed!\n");
1887 fSlaveTrackers[iSlice].SetGPUTrackerTrackletsMemory(&tmpMemory[0], 1, fConstructorBlockCount);
1888 fSlaveTrackers[iSlice].CommonMemory()->fNTracklets = 1;
// Each slice extends tracks into its left/right neighbours; the two TPC
// halves (fgkNSlices / 2 each) wrap around independently.
1891 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1893 int sliceLeft = (iSlice + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2);
1894 int sliceRight = (iSlice + 1) % (fgkNSlices / 2);
1895 if (iSlice >= fgkNSlices / 2)
1897 sliceLeft += fgkNSlices / 2;
1898 sliceRight += fgkNSlices / 2;
1900 fSlaveTrackers[iSlice].PerformGlobalTracking(fSlaveTrackers[sliceLeft], fSlaveTrackers[sliceRight], HLTCA_GPU_MAX_TRACKS);
// Debug summary of local vs. global track/hit counts per slice.
1903 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1905 printf("Slice %d - Tracks: Local %d Global %d - Hits: Local %d Global %d\n", iSlice, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTracks, fSlaveTrackers[iSlice].CommonMemory()->fNTracks, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTrackHits, fSlaveTrackers[iSlice].CommonMemory()->fNTrackHits);
// --- Produce the slice output structures on the host. ---
1909 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1911 fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
1912 fSlaveTrackers[firstSlice + iSlice].WriteOutputPrepare();
1913 fSlaveTrackers[firstSlice + iSlice].WriteOutput();
1916 StandalonePerfTime(firstSlice, 10);
1918 if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
// Initialize the slice parameters of one slave tracker and verify that its
// row count matches the compile-time constant the GPU code was built for.
//
// Fix: the reference parameter was mis-encoded in the source as "¶m"
// (an HTML-entity garbling of "&param"); restored to a proper reference.
//
// Parameters:
//   iSlice - index of the slave tracker to initialize.
//   param  - slice parameter set to install.
// Returns 0 on success, 1 if the row count disagrees with HLTCA_ROW_COUNT.
int AliHLTTPCCAGPUTrackerNVCC::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
{
	//Initialize Slice Tracker Parameter for a slave tracker
	fSlaveTrackers[iSlice].Initialize(param);
	// The GPU kernels are compiled for a fixed row count; a mismatch would
	// corrupt the row-indexed memory layout, so reject it here.
	if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
	{
		HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
		return(1);
	}
	return(0);
}
// Release all GPU resources acquired during initialization: synchronize the
// device, free device and pinned host memory, destroy the CUDA streams,
// stop the helper threads and destroy their mutexes, and finally tear down
// the CUDA driver context.
// NOTE(review): this excerpt omits interleaved lines (braces, returns);
// comments describe only the visible statements.
1935 int AliHLTTPCCAGPUTrackerNVCC::ExitGPU()
// Make the tracker's driver context current on this thread before any CUDA call.
1938 cuCtxPushCurrent(*((CUcontext*) fCudaContext));
// Wait for outstanding GPU work before freeing its memory.
1940 cudaThreadSynchronize();
1943 cudaFree(fGPUMemory);
1946 if (fHostLockedMemory)
// Streams were created alongside the pinned host memory; at least 3 exist.
1948 for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
1950 cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
1952 free(fpCudaStreams);
1954 cudaFreeHost(fHostLockedMemory);
1957 if (CudaFailedMsg(cudaThreadExit()))
1959 HLTError("Could not uninitialize GPU");
// Helper threads must be joined before their mutexes are destroyed.
1963 if (StopHelperThreads()) return(1);
1964 pthread_mutex_destroy((pthread_mutex_t*) fHelperMemMutex);
1965 free(fHelperMemMutex);
1967 for (int i = 0;i < fgkNSlices;i++) pthread_mutex_destroy(&((pthread_mutex_t*) fSliceGlobalMutexes)[i]);
1968 free(fSliceGlobalMutexes);
1970 cuCtxDestroy(*((CUcontext*) fCudaContext));
1974 HLTInfo("CUDA Uninitialized");
1975 fCudaInitialized = 0;
// After an error, bring every helper thread (and CPU-tracker thread) back to
// a known idle state by setting its reset flag and, depending on the
// 'helpers' argument, re-acquiring its second mutex.
// NOTE(review): braces are omitted in this excerpt.
1979 void AliHLTTPCCAGPUTrackerNVCC::ResetHelperThreads(int helpers)
1981 HLTImportant("Error occurred, GPU tracker helper threads will be reset (Number of threads %d/%d)", fNHelperThreads, fNCPUTrackers);
1982 for (int i = 0;i < fNHelperThreads + fNCPUTrackers;i++)
1984 fHelperParams[i].fReset = true;
// Mutex [1] is taken for CPU-tracker threads always, for helper threads only
// when the caller requests it via 'helpers'.
1985 if (helpers || i >= fNHelperThreads) pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]);
1987 HLTImportant("GPU Tracker helper threads have ben reset");
// Terminate all slave/helper threads: signal termination, release mutex [0]
// to wake each thread, wait on mutex [1] for it to acknowledge, join it,
// then unlock, destroy and free both of its mutexes and its thread handle.
// NOTE(review): this excerpt omits braces and return statements; comments
// describe only the visible statements.
1990 int AliHLTTPCCAGPUTrackerNVCC::StopHelperThreads()
1994 for (int i = 0;i < fNSlaveThreads;i++)
1996 fHelperParams[i].fTerminate = true;
// Unlocking mutex [0] lets the blocked helper thread run and see fTerminate.
1997 if (pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]))
1999 HLTError("Error unlocking mutex to terminate slave");
// Mutex [1] is the helper's completion handshake.
2002 if (pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]))
2004 HLTError("Error locking mutex");
2007 if (pthread_join( *((pthread_t*) fHelperParams[i].fThreadId), NULL))
2009 HLTError("Error waiting for thread to terminate");
2012 free(fHelperParams[i].fThreadId);
// A mutex must be unlocked before pthread_mutex_destroy is valid.
2013 for (int j = 0;j < 2;j++)
2015 if (pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]))
2017 HLTError("Error unlocking mutex before destroying");
2020 pthread_mutex_destroy(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]);
2022 free(fHelperParams[i].fMutex);
2024 delete[] fHelperParams;
void AliHLTTPCCAGPUTrackerNVCC::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
{
	// Remember the output control structure and hand it down to every slave
	// tracker, so all slices write their output through the same control object.
	fOutputControl = val;
	int iSlice = 0;
	while (iSlice < fgkNSlices)
	{
		fSlaveTrackers[iSlice].SetOutputControl(val);
		iSlice++;
	}
}
// Return an integer identifier of the calling OS thread: the Win32
// GetCurrentThread() handle value on Windows, the Linux thread id
// (SYS_gettid) otherwise.
// NOTE(review): the preprocessor conditional separating the two return
// statements is not visible in this excerpt.
2040 int AliHLTTPCCAGPUTrackerNVCC::GetThread()
2044 return((int) (size_t) GetCurrentThread());
2046 return((int) syscall (SYS_gettid));
unsigned long long int* AliHLTTPCCAGPUTrackerNVCC::PerfTimer(int iSlice, unsigned int i)
{
	// Hand out a pointer to performance counter i of slice iSlice, or NULL
	// when the slave tracker array does not exist.
	if (!fSlaveTrackers)
	{
		return(NULL);
	}
	return(fSlaveTrackers[iSlice].PerfTimer(i));
}
const AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerNVCC::OutputControl() const
{
	// Accessor: the output control structure currently installed via
	// SetOutputControl (may be NULL).
	return(fOutputControl);
}
int AliHLTTPCCAGPUTrackerNVCC::GetSliceCount() const
{
	// Accessor: maximum number of slices this tracker instance can process
	// in one reconstruction pass.
	return fSliceCount;
}
char* AliHLTTPCCAGPUTrackerNVCC::MergerBaseMemory()
{
	// Base pointer of the host-side GPU-merger buffer, aligned up to 1 MB.
	char* const base = (char*) fGPUMergerHostMemory;
	return(alignPointer(base, 1024 * 1024));
}
// Refit the merged TPC tracks on the GPU: carve per-array regions out of the
// merger's GPU buffer, upload cluster coordinates, tracks, field polynomial
// and slice parameters, run the RefitTracks kernel, copy the refitted data
// back and (at debug level > 0) print transfer/fit timing.
// Compiled out (error) unless HLTCA_GPU_MERGER is set; fails if the GPU is
// not initialized or the merger buffer is too small.
// NOTE(review): this excerpt omits interleaved lines (#else/#endif, braces,
// returns, and the declaration of 'field'); comments describe only the
// visible statements.
2073 int AliHLTTPCCAGPUTrackerNVCC::RefitMergedTracks(AliHLTTPCGMMerger* Merger)
2075 #ifndef HLTCA_GPU_MERGER
2076 HLTError("HLTCA_GPU_MERGER compile flag not set");
2079 if (!fCudaInitialized)
2081 HLTError("GPU Merger not initialized");
// a..d are timestamps around the copy/fit/copy phases; e is the timer frequency.
2084 unsigned long long int a, b, c, d, e;
2085 AliHLTTPCCATracker::StandaloneQueryFreq(&e);
// Lay out device arrays sequentially inside the merger's GPU buffer.
2087 char* gpumem = (char*) fGPUMergerMemory;
2088 float *X, *Y, *Z, *Angle;
2089 unsigned int *RowType;
2090 AliHLTTPCGMMergedTrack* tracks;
// NOTE(review): 'field' (used below) is declared on a line not visible here.
2092 AliHLTTPCCAParam* param;
2094 gpumem = alignPointer(gpumem, 1024 * 1024);
// Cluster arrays are sized by NClusters(); transfers below copy only
// NOutputTrackClusters() elements of each.
2096 AssignMemory(X, gpumem, Merger->NClusters());
2097 AssignMemory(Y, gpumem, Merger->NClusters());
2098 AssignMemory(Z, gpumem, Merger->NClusters());
2099 AssignMemory(Angle, gpumem, Merger->NClusters());
2100 AssignMemory(RowType, gpumem, Merger->NClusters());
2101 AssignMemory(tracks, gpumem, Merger->NOutputTracks());
2102 AssignMemory(field, gpumem, 6);
2103 AssignMemory(param, gpumem, 1);
2106 if ((size_t) (gpumem - (char*) fGPUMergerMemory) > (size_t) fGPUMergerMaxMemory)
2108 HLTError("Insufficiant GPU Merger Memory");
// Driver context must be current on this thread for the runtime calls below.
2111 cuCtxPushCurrent(*((CUcontext*) fCudaContext));
2113 if (fDebugLevel >= 2) HLTInfo("Running GPU Merger (%d/%d)", Merger->NOutputTrackClusters(), Merger->NClusters());
// Phase 1: host -> device transfers (timed a..b).
2114 AliHLTTPCCATracker::StandaloneQueryTime(&a);
2115 CudaFailedMsg(cudaMemcpy(X, Merger->ClusterX(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
2116 CudaFailedMsg(cudaMemcpy(Y, Merger->ClusterY(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
2117 CudaFailedMsg(cudaMemcpy(Z, Merger->ClusterZ(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
2118 CudaFailedMsg(cudaMemcpy(Angle, Merger->ClusterAngle(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
2119 CudaFailedMsg(cudaMemcpy(RowType, Merger->ClusterRowType(), Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyHostToDevice));
2120 CudaFailedMsg(cudaMemcpy(tracks, Merger->OutputTracks(), Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyHostToDevice));
2121 CudaFailedMsg(cudaMemcpy(field, Merger->PolinomialFieldBz(), 6 * sizeof(float), cudaMemcpyHostToDevice));
2122 CudaFailedMsg(cudaMemcpy(param, fSlaveTrackers[0].pParam(), sizeof(AliHLTTPCCAParam), cudaMemcpyHostToDevice));
// Phase 2: the refit kernel itself (timed b..c).
2123 AliHLTTPCCATracker::StandaloneQueryTime(&b);
2124 RefitTracks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(tracks, Merger->NOutputTracks(), field, X, Y, Z, RowType, Angle, param);
2125 CudaFailedMsg(cudaThreadSynchronize());
2126 AliHLTTPCCATracker::StandaloneQueryTime(&c);
// Phase 3: device -> host transfers of the refitted data (timed c..d).
2127 CudaFailedMsg(cudaMemcpy(Merger->ClusterX(), X, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
2128 CudaFailedMsg(cudaMemcpy(Merger->ClusterY(), Y, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
2129 CudaFailedMsg(cudaMemcpy(Merger->ClusterZ(), Z, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
2130 CudaFailedMsg(cudaMemcpy(Merger->ClusterAngle(), Angle, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
2131 CudaFailedMsg(cudaMemcpy(Merger->ClusterRowType(), RowType, Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyDeviceToHost));
2132 CudaFailedMsg(cudaMemcpy((void*) Merger->OutputTracks(), tracks, Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyDeviceToHost));
2133 CudaFailedMsg(cudaThreadSynchronize());
2134 AliHLTTPCCATracker::StandaloneQueryTime(&d);
2135 if (fDebugLevel >= 2) HLTInfo("GPU Merger Finished");
// Timing report: transfer sizes and bandwidths derived from timestamps a..d
// and the timer frequency e.
2137 if (fDebugLevel > 0)
2139 int copysize = 4 * Merger->NOutputTrackClusters() * sizeof(float) + Merger->NOutputTrackClusters() * sizeof(unsigned int) + Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack) + 6 * sizeof(float) + sizeof(AliHLTTPCCAParam);
2140 double speed = (double) copysize * (double) e / (double) (b - a) / 1e9;
2141 printf("GPU Fit:\tCopy To:\t%lld us (%lf GB/s)\n", (b - a) * 1000000 / e, speed);
2142 printf("\t\tFit:\t%lld us\n", (c - b) * 1000000 / e);
2143 speed = (double) copysize * (double) e / (double) (d - c) / 1e9;
2144 printf("\t\tCopy From:\t%lld us (%lf GB/s)\n", (d - c) * 1000000 / e, speed);
2147 cuCtxPopCurrent((CUcontext*) fCudaContext);
int AliHLTTPCCAGPUTrackerNVCC::IsInitialized()
{
	// Accessor: nonzero once CUDA initialization has completed successfully.
	return fCudaInitialized;
}
AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()
{
	// Factory entry point: hand out a fresh NVCC tracker instance through
	// the abstract AliHLTTPCCAGPUTracker interface.
	AliHLTTPCCAGPUTrackerNVCC* const instance = new AliHLTTPCCAGPUTrackerNVCC;
	return instance;
}
2162 void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr)