]> git.uio.no Git - u/mrichter/AliRoot.git/blame - HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cu
bug fix: reconstruction crash when the output buffer size is exceeded
[u/mrichter/AliRoot.git] / HLT / TPCLib / tracking-ca / AliHLTTPCCAGPUTrackerNVCC.cu
CommitLineData
eb08490f 1// **************************************************************************
2// This file is property of and copyright by the ALICE HLT Project *
3// ALICE Experiment at CERN, All rights reserved. *
4// *
5// Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
6// Ivan Kisel <kisel@kip.uni-heidelberg.de> *
7// David Rohr <drohr@kip.uni-heidelberg.de> *
8// for The ALICE HLT Project. *
9// *
10// Permission to use, copy, modify and distribute this software and its *
11// documentation strictly for non-commercial purposes is hereby granted *
12// without fee, provided that the above copyright notice appears in all *
13// copies and that both the copyright notice and this permission notice *
14// appear in the supporting documentation. The authors make no claims *
15// about the suitability of this software for any purpose. It is *
16// provided "as is" without express or implied warranty. *
17// *
18//***************************************************************************
19
31649d4b 20#include "AliHLTTPCCAGPUTrackerNVCC.h"
eb08490f 21
31649d4b 22#ifdef HLTCA_GPUCODE
3a4ab1ca 23#include <cuda.h>
31649d4b 24#include <sm_11_atomic_functions.h>
25#include <sm_12_atomic_functions.h>
26#endif
27
73a33d2e 28#ifdef R__WIN32
73a33d2e 29#else
3a4ab1ca 30#include <sys/syscall.h>
444e5682 31#include <semaphore.h>
32#include <fcntl.h>
73a33d2e 33#endif
3a4ab1ca 34
eb08490f 35#include "AliHLTTPCCADef.h"
36#include "AliHLTTPCCAGPUConfig.h"
37
eb08490f 38
39#include <iostream>
40
41//Disable assertions since they produce errors in GPU Code
42#ifdef assert
43#undef assert
44#endif
45#define assert(param)
46
47__constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];
48#ifdef HLTCA_GPU_TEXTURE_FETCH
49texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;
50texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;
51texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
52#endif
53
eb08490f 54//Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code
55#include "AliHLTTPCCATrackParam.cxx"
56#include "AliHLTTPCCATrack.cxx"
57
eb08490f 58#include "AliHLTTPCCAHitArea.cxx"
59#include "AliHLTTPCCAGrid.cxx"
60#include "AliHLTTPCCARow.cxx"
61#include "AliHLTTPCCAParam.cxx"
62#include "AliHLTTPCCATracker.cxx"
63
eb08490f 64#include "AliHLTTPCCAProcess.h"
65
8566066c 66#include "AliHLTTPCCATrackletSelector.cxx"
eb08490f 67#include "AliHLTTPCCANeighboursFinder.cxx"
eb08490f 68#include "AliHLTTPCCANeighboursCleaner.cxx"
69#include "AliHLTTPCCAStartHitsFinder.cxx"
70#include "AliHLTTPCCAStartHitsSorter.cxx"
71#include "AliHLTTPCCATrackletConstructor.cxx"
72#include "AliHLTTPCCASliceOutput.cxx"
73
74#include "MemoryAssignmentHelpers.h"
75
76#ifndef HLTCA_STANDALONE
77#include "AliHLTDefinitions.h"
78#include "AliHLTSystem.h"
79#endif
80
31649d4b 81ClassImp( AliHLTTPCCAGPUTrackerNVCC )
eb08490f 82
31649d4b 83bool AliHLTTPCCAGPUTrackerNVCC::fgGPUUsed = false;
eb08490f 84
444e5682 85#define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem"
86
AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() :
	fGpuTracker(NULL),
	fGPUMemory(NULL),
	fHostLockedMemory(NULL),
	fDebugLevel(0),
	fOutFile(NULL),
	fGPUMemSize(0),
	fpCudaStreams(NULL),
	fSliceCount(0),
	fOutputControl(NULL),
	fThreadId(0),
	fCudaInitialized(0)
{
	//Default constructor. All resources are acquired later in InitGPU(),
	//so every pointer starts NULL and all sizes/counters start at 0.
	//(Removed stray ';' after the body, ill-formed before C++11.)
}
100
//Destructor. Intentionally empty: GPU/host resources are not released here
//(teardown is handled elsewhere, not visible in this part of the file).
//Removed stray ';' after the body, ill-formed before C++11.
AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC() {}
102
void AliHLTTPCCAGPUTrackerNVCC::ReleaseGlobalLock(void* sem)
{
	//Release the global named semaphore that locks GPU Initialization.
	//sem is the platform-specific handle created in InitGPU()
	//(HANDLE* on Windows, sem_t* on POSIX).
#ifdef R__WIN32
	HANDLE* h = (HANDLE*) sem;
	ReleaseSemaphore(*h, 1, NULL);
	CloseHandle(*h);
	delete h;
#else
	sem_t* pSem = (sem_t*) sem;
	sem_post(pSem);
	sem_close(pSem); //Fix: close our descriptor; it was previously leaked (sem_unlink only removes the name)
	sem_unlink(SemLockName);
#endif
}
117
int AliHLTTPCCAGPUTrackerNVCC::CheckMemorySizes(int sliceCount)
{
	//Check constants for correct memory sizes: verify that the compile-time
	//GPU memory budgets are large enough for sliceCount slices.
	//Returns 0 when consistent, 1 (with an error message) otherwise.
	//(Also fixes the misspelled "Insufficiant" log messages.)
	if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
	{
		HLTError("Insufficient Tracker Object Memory");
		return(1);
	}

	if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
	{
		HLTError("Insufficient Common Memory");
		return(1);
	}

	if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
	{
		HLTError("Insufficient Row Memory");
		return(1);
	}
	return(0);
}
140
int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)
{
	//Find best CUDA device, initialize and allocate memory.
	//sliceCount: number of TPC slices to process concurrently.
	//forceDeviceID: CUDA device index to use, or -1 for automatic selection.
	//Returns 0 on success (fGPUMemory, fHostLockedMemory and the CUDA streams
	//are then allocated and fCudaInitialized is set), 1 on any failure.

	if (CheckMemorySizes(sliceCount)) return(1);

	//Serialize GPU initialization across processes with a named semaphore.
	//Every exit path after this point must call ReleaseGlobalLock().
#ifdef R__WIN32
	HANDLE* semLock = new HANDLE;
	*semLock = CreateSemaphore(NULL, 1, 1, SemLockName);
	if (*semLock == NULL)
	{
		HLTError("Error creating GPUInit Semaphore");
		return(1);
	}
	WaitForSingleObject(*semLock, INFINITE);
#else
	sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1); //0x01B6 == 0666 permissions
	if (semLock == SEM_FAILED)
	{
		HLTError("Error creating GPUInit Semaphore");
		return(1);
	}
	sem_wait(semLock);
#endif

	//Only one tracker instance per process may use the GPU (class-static flag)
	if (fgGPUUsed)
	{
		HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in same process");
		ReleaseGlobalLock(semLock);
		return(1);
	}
	fgGPUUsed = 1;
	fThreadId = GetThread(); //Remember the initializing thread; Reconstruct() rejects calls from other threads

	cudaDeviceProp fCudaDeviceProp;

	//Total device memory needed for the requested number of slices
	fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);

#ifndef CUDA_DEVICE_EMULATION
	int count, bestDevice = -1;
	long long int bestDeviceSpeed = 0, deviceSpeed;
	if (CudaFailedMsg(cudaGetDeviceCount(&count)))
	{
		HLTError("Error getting CUDA Device Count");
		fgGPUUsed = 0;
		ReleaseGlobalLock(semLock);
		return(1);
	}
	if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");
	for (int i = 0;i < count;i++)
	{
		//Query free memory via the driver API; needs a temporary context on the device.
		//NOTE(review): 'unsigned int' matches the pre-CUDA-3.2 cuMemGetInfo
		//signature - revisit when upgrading the toolkit (size_t afterwards).
		unsigned int free, total;
		cuInit(0);
		CUdevice tmpDevice;
		cuDeviceGet(&tmpDevice, i);
		CUcontext tmpContext;
		cuCtxCreate(&tmpContext, 0, tmpDevice);
		if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";
		cuCtxDestroy(tmpContext);
		CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i));

		//Eligible: compute capability >= 1.2 (and < 9) with enough free memory
		int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && free >= fGPUMemSize;

		if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %d / %lld)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, free, (long long int) fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");
		//Crude speed metric: multiprocessors * clock * warp size * free memory
		deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free;
		if (deviceOK && deviceSpeed > bestDeviceSpeed)
		{
			bestDevice = i;
			bestDeviceSpeed = deviceSpeed;
		}
	}
	if (bestDevice == -1)
	{
		HLTWarning("No CUDA Device available, aborting CUDA Initialisation");
		fgGPUUsed = 0;
		ReleaseGlobalLock(semLock);
		return(1);
	}

	//forceDeviceID == -1 selects the automatically determined best device
	int cudaDevice;
	if (forceDeviceID == -1)
		cudaDevice = bestDevice;
	else
		cudaDevice = forceDeviceID;
#else
	int cudaDevice = 0;
#endif

	cudaGetDeviceProperties(&fCudaDeviceProp ,cudaDevice );

	if (fDebugLevel >= 1)
	{
		HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);
		HLTInfo("totalGlobalMem = %lld", (unsigned long long int) fCudaDeviceProp.totalGlobalMem);
		HLTInfo("sharedMemPerBlock = %lld", (unsigned long long int) fCudaDeviceProp.sharedMemPerBlock);
		HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);
		HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);
		HLTInfo("memPitch = %lld", (unsigned long long int) fCudaDeviceProp.memPitch);
		HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);
		HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);
		HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);
		HLTInfo("totalConstMem = %lld", (unsigned long long int) fCudaDeviceProp.totalConstMem);
		HLTInfo("major = %d", fCudaDeviceProp.major);
		HLTInfo("minor = %d", fCudaDeviceProp.minor);
		HLTInfo("clockRate %d= ", fCudaDeviceProp.clockRate);
		HLTInfo("textureAlignment %lld= ", (unsigned long long int) fCudaDeviceProp.textureAlignment);
	}

	//Re-check compute capability (relevant when forceDeviceID bypassed the scan)
	if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
	{
		HLTError( "Unsupported CUDA Device" );
		fgGPUUsed = 0;
		ReleaseGlobalLock(semLock);
		return(1);
	}

	if (CudaFailedMsg(cudaSetDevice(cudaDevice)))
	{
		HLTError("Could not set CUDA Device!");
		fgGPUUsed = 0;
		ReleaseGlobalLock(semLock);
		return(1);
	}

	//Allocate the single big GPU memory block; all tracker structures are carved out of it
	if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
	{
		HLTError("CUDA Memory Allocation Error");
		cudaThreadExit();
		fgGPUUsed = 0;
		ReleaseGlobalLock(semLock);
		return(1);
	}
	ReleaseGlobalLock(semLock); //Device memory is claimed; other processes may initialize now
	if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
	//Page-locked host memory, required for the asynchronous copies in Reconstruct()
	int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
	if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
	{
		cudaFree(fGPUMemory);
		cudaThreadExit();
		HLTError("Error allocating Page Locked Host Memory");
		fgGPUUsed = 0;
		return(1);
	}
	if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);

	if (fDebugLevel >= 1)
	{
		//Debug mode: fill GPU memory with a marker byte (143) so use of
		//uninitialized data becomes visible
		CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
	}

	fSliceCount = sliceCount;
	//Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
	fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);

	for (int i = 0;i < fgkNSlices;i++)
	{
		fSlaveTrackers[i].SetGPUTracker();
		fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
		fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
	}

	//At least 3 streams: Reconstruct() addresses streams 0, 1 and 2 explicitly
	fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
	cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
	for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
	{
		if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
		{
			cudaFree(fGPUMemory);
			cudaFreeHost(fHostLockedMemory);
			cudaThreadExit();
			HLTError("Error creating CUDA Stream");
			fgGPUUsed = 0;
			return(1);
		}
	}

	fCudaInitialized = 1;
	HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %d)", cudaDevice, fCudaDeviceProp.name, fThreadId);

	//NOTE(review): '&' below is bitwise, but acts like '&&' on these 0/1 operands
#if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)
	if (fDebugLevel < 2)
	{
		//Do one initial run for Benchmark reasons
		const int useDebugLevel = fDebugLevel;
		fDebugLevel = 0;
		AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];

		std::ifstream fin;

		AliHLTTPCCAParam tmpParam;
		AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;

		//Load slice parameters from the standalone settings dump
		fin.open("events/settings.dump");
		int tmpCount;
		fin >> tmpCount;
		for (int i = 0;i < sliceCount;i++)
		{
			fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);
			tmpParam.ReadSettings(fin);
			InitializeSliceParam(i, tmpParam);
		}
		fin.close();

		//Load one warm-up event from the standalone event dump
		fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);
		for (int i = 0;i < sliceCount;i++)
		{
			tmpCluster[i].StartReading(i, 0);
			tmpCluster[i].ReadEvent(fin);
			tmpCluster[i].FinishReading();
		}
		fin.close();

		AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];
		memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));

		Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);
		for (int i = 0;i < sliceCount;i++)
		{
			//NOTE(review): free() assumes the slice output buffers were malloc'ed - confirm against AliHLTTPCCASliceOutput
			free(tmpOutput[i]);
			tmpOutput[i] = NULL;
			fSlaveTrackers[i].SetOutputControl(NULL);
		}
		delete[] tmpOutput;
		delete[] tmpCluster;
		fDebugLevel = useDebugLevel;
	}
#endif
	return(0);
}
370
template <class T> inline T* AliHLTTPCCAGPUTrackerNVCC::alignPointer(T* ptr, int alignment)
{
	//Round ptr up to the next multiple of alignment bytes.
	//Used to place tracker structures on 1 MB boundaries; this must stay
	//consistent with every other alignment in the tracker (valid as long as
	//each single data structure is <= 1 MB).

	size_t address = (size_t) ptr;
	const size_t remainder = address % alignment;
	if (remainder != 0)
	{
		address += (size_t) alignment - remainder;
	}
	return((T*) address);
}
384
bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsg(cudaError_t error)
{
	//Return true when the given CUDA status code signals a failure, logging
	//the numeric code and its human-readable description; cudaSuccess is
	//accepted silently.
	const bool failed = (error != cudaSuccess);
	if (failed)
	{
		HLTWarning("CUDA Error: %d / %s", error, cudaGetErrorString(error));
	}
	return(failed);
}
392
int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state)
{
	//Wait for the device to finish and report any pending CUDA error.
	//state names the operation being checked (used in log output only).
	//Returns 0 on success, 1 on error; a no-op when fDebugLevel == 0.

	if (fDebugLevel == 0) return(0);

	//First pick up an error left behind by a kernel launch
	const cudaError launchError = cudaGetLastError();
	if (launchError != cudaSuccess)
	{
		HLTError("Cuda Error %s while invoking kernel (%s)", cudaGetErrorString(launchError), state);
		return(1);
	}
	//Then block until all queued work completes and check the result
	if (CudaFailedMsg(cudaThreadSynchronize()))
	{
		HLTError("CUDA Error while synchronizing (%s)", state);
		return(1);
	}
	if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
	return(0);
}
413
void AliHLTTPCCAGPUTrackerNVCC::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
{
	//Adjust the tracker debug verbosity and, optionally, redirect the debug
	//dump stream. Passing NULL for NewOutFile keeps the current stream.
	fDebugLevel = dwLevel;
	if (NewOutFile != NULL)
	{
		fOutFile = NewOutFile;
	}
}
420
int AliHLTTPCCAGPUTrackerNVCC::SetGPUTrackerOption(char* OptionName, int /*OptionValue*/)
{
	//Set a named GPU tracker option. No options are implemented at the
	//moment, so every request is logged as unknown and rejected (returns 1).
	HLTError("Unknown Option: %s", OptionName);
	return(1);
}
431
#ifdef HLTCA_STANDALONE
void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int iSlice, int i)
{
	//Run Performance Query for timer i of slice iSlice.
	//Only active at fDebugLevel >= 1; records the current time into the
	//slice tracker's performance timer via the standalone framework.
	if (fDebugLevel >= 1)
	{
		AliHLTTPCCAStandaloneFramework::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));
	}
}
#else
//Non-standalone builds: performance timing is compiled out (no-op)
void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
#endif
444
void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
{
	//Dump Rowblocks to File.
	//Copies the scheduler row-block structures of slice iSlice from the GPU
	//and writes them to *fOutFile; with check == true it additionally
	//verifies that the phase-1 tracklet numbering is contiguous and contains
	//no -1 entries. Active only at fDebugLevel >= 4.
	if (fDebugLevel >= 4)
	{
		*fOutFile << "RowBlock Tracklets" << std::endl;

		//Host-side staging buffers for the device-resident data
		int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);
		int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);
		uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT);
		CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
		CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
		CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT, cudaMemcpyDeviceToHost));
		CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));

		//k tracks the expected tracklet id while cross-checking phase 1
		int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
		for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
		{
			//First half of rowBlockPos holds "up", second half "down" entries
			*fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " <<
				rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";
			for (int j = 0;j < rowBlockPos[i].x;j++)
			{
				//Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
				*fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
				if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)
				{
					HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);
				}
				k++;
				if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)
				{
					HLTError("Error, -1 Tracklet found");
				}
			}
			*fOutFile << std::endl << "Phase 2: ";
			for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
			{
				*fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
			}
			*fOutFile << std::endl;
		}

		if (check)
		{
			//Dump the per-block starting tracklet ranges
			*fOutFile << "Starting Threads: (First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;
			for (int i = 0;i < HLTCA_GPU_BLOCK_COUNT;i++)
			{
				*fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;
			}
		}

		free(rowBlockPos);
		free(rowBlockTracklets);
		free(blockStartingTracklet);
	}
}
501
__global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)
{
	//Grid-stride initialization of the scheduler row blocks and the slice
	//data hit weights before tracklet construction:
	//  RowBlockPos         -> zeroed
	//  RowBlockTracklets   -> filled with -1 (int4-wide stores)
	//  SliceDataHitWeights -> zeroed (int4-wide stores)
	int4* const trackletWords = (int4*) RowBlockTracklets;
	int4* const hitWeightWords = (int4*) SliceDataHitWeights;

	const int firstIndex = blockIdx.x * blockDim.x + threadIdx.x;
	const int step = blockDim.x * gridDim.x;

	int4 zero4;
	zero4.x = zero4.y = zero4.z = zero4.w = 0;
	int4 minusOne4;
	minusOne4.x = minusOne4.y = minusOne4.z = minusOne4.w = -1;

	for (int i = firstIndex;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += step)
		RowBlockPos[i] = zero4;
	for (int i = firstIndex;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += step)
		trackletWords[i] = minusOne4;
	for (int i = firstIndex;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += step)
		hitWeightWords[i] = zero4;
}
518
31649d4b 519int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
eb08490f 520{
521 //Primary reconstruction function
522 cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
523
3a4ab1ca 524 if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;
525
526 if (!fCudaInitialized)
527 {
528 HLTError("GPUTracker not initialized");
529 return(1);
530 }
531 if (sliceCountLocal > fSliceCount)
532 {
533 HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);
534 return(1);
535 }
73a33d2e 536 if (fThreadId != GetThread())
3a4ab1ca 537 {
73a33d2e 538 HLTError("GPUTracker context was initialized by different thread, Initializing Thread: %d, Processing Thread: %d", fThreadId, GetThread());
3a4ab1ca 539 return(1);
540 }
541
542 if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);
eb08490f 543
544 if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
545 {
546 HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));
547 return(1);
548 }
549
550 if (fDebugLevel >= 4)
551 {
552 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
553 {
8566066c 554 *fOutFile << std::endl << std::endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << std::endl;
eb08490f 555 }
556 }
557
558 memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
559
eb08490f 560 if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
561
562#ifdef HLTCA_GPU_TIME_PROFILE
c7d6f78b 563 unsigned __int64 a, b, c, d;
8566066c 564 AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);
565 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);
eb08490f 566#endif
567
568 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
569 {
570 //Make this a GPU Tracker
571 fGpuTracker[iSlice].SetGPUTracker();
572 fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
8566066c 573 fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
574 fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
eb08490f 575
576 //Set Pointers to GPU Memory
577 char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
578
579 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
580 tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
581 tmpMem = alignPointer(tmpMem, 1024 * 1024);
582
583 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
584 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);
585 tmpMem = alignPointer(tmpMem, 1024 * 1024);
586
587 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
588 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());
589 tmpMem = alignPointer(tmpMem, 1024 * 1024);
590
591 if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
592 {
593 HLTError("Insufficiant Track Memory");
594 return(1);
595 }
596
597 if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
598 {
599 HLTError("Insufficiant Global Memory");
600 return(1);
601 }
602
603 //Initialize Startup Constants
604 *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
605 *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
606 *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
607 fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = HLTCA_GPU_BLOCK_COUNT * (iSlice + 1) / sliceCountLocal - HLTCA_GPU_BLOCK_COUNT * (iSlice) / sliceCountLocal;
608 if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);
609 fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
610 fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
611 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
8566066c 612 fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
eb08490f 613 }
614
615#ifdef HLTCA_GPU_TEXTURE_FETCH
616 cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
617 size_t offset;
618 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
619 {
3a4ab1ca 620 HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
eb08490f 621 return(1);
622 }
623 cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
624 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
625 {
3a4ab1ca 626 HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
eb08490f 627 return(1);
628 }
629 cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
630 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
631 {
3a4ab1ca 632 HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
eb08490f 633 return(1);
634 }
635#endif
636
637 //Copy Tracker Object to GPU Memory
638 if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
639#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
640 if (CudaFailedMsg(cudaMalloc(&fGpuTracker[0].fStageAtSync, 100000000))) return(1);
641 CudaFailedMsg(cudaMemset(fGpuTracker[0].fStageAtSync, 0, 100000000));
642#endif
643 CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));
644
645 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
646 {
647 StandalonePerfTime(firstSlice + iSlice, 0);
648
649 //Initialize GPU Slave Tracker
650 if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
8566066c 651 fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
eb08490f 652#ifdef HLTCA_GPU_TIME_PROFILE
8566066c 653 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
eb08490f 654#endif
655 fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
31649d4b 656
657 if (fDebugLevel >= 4)
658 {
659 *fOutFile << std::endl << std::endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << std::endl;
660 *fOutFile << "Slice Data:" << std::endl;
661 fSlaveTrackers[firstSlice + iSlice].DumpSliceData(*fOutFile);
662 }
663
eb08490f 664#ifdef HLTCA_GPU_TIME_PROFILE
8566066c 665 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
eb08490f 666 printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
667#endif
668 if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
669 {
670 HLTError("Insufficiant Slice Data Memory");
671 return(1);
672 }
673
674 /*if (fSlaveTrackers[firstSlice + iSlice].CheckEmptySlice())
675 {
676 if (fDebugLevel >= 3) HLTInfo("Slice Empty, not running GPU Tracker");
677 if (sliceCountLocal == 1)
678 return(0);
679 }*/
680
681 //Initialize temporary memory where needed
682 if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");
683 PreInitRowBlocks<<<30, 256, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());
684
685 //Copy Data to GPU Global Memory
686 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
687 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
688 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
689
690 if (fDebugLevel >= 4)
691 {
692 if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
444e5682 693 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS);
694 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() );
eb08490f 695 }
696
697 if (CUDASync("Initialization")) return(1);
698 StandalonePerfTime(firstSlice + iSlice, 1);
699
700 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");
701 AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
702
703 if (CUDASync("Neighbours finder")) return 1;
704
705 StandalonePerfTime(firstSlice + iSlice, 2);
706
707 if (fDebugLevel >= 4)
708 {
709 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
710 fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
711 }
712
713 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");
714 AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
715 if (CUDASync("Neighbours Cleaner")) return 1;
716
717 StandalonePerfTime(firstSlice + iSlice, 3);
718
719 if (fDebugLevel >= 4)
720 {
721 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
722 fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
723 }
724
725 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");
726 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
727 if (CUDASync("Start Hits Finder")) return 1;
728
729 StandalonePerfTime(firstSlice + iSlice, 4);
730
731 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter");
732 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<30, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
733 if (CUDASync("Start Hits Sorter")) return 1;
734
735 StandalonePerfTime(firstSlice + iSlice, 5);
736
737 if (fDebugLevel >= 2)
738 {
739 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
740 if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
741 if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS)
742 {
743 HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant");
744 return(1);
745 }
746 }
747
748 if (fDebugLevel >= 4)
749 {
750 *fOutFile << "Temporary ";
751 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
752 fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
753 uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
754 CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
755 *fOutFile << "Start Hits Sort Vector:" << std::endl;
756 for (int i = 0;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows();i++)
757 {
758 *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;
759 }
760 free(tmpMemory);
761
762 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
763 fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
764 }
765
766 StandalonePerfTime(firstSlice + iSlice, 6);
767
768 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
769 }
770
771 StandalonePerfTime(firstSlice, 7);
772#ifdef HLTCA_GPU_PREFETCHDATA
773 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
774 {
775 if (fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v) > ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4))
776 {
777 HLTError("Insufficiant GPU shared Memory, required: %d, available %d", fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v), ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4));
778 return(1);
779 }
780 if (fDebugLevel >= 1)
781 {
782 static int infoShown = 0;
783 if (!infoShown)
784 {
785 HLTInfo("GPU Shared Memory Cache Size: %d", 2 * fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v));
786 infoShown = 1;
787 }
788 }
789 }
790#endif
791
3a4ab1ca 792 int nHardCollisions = 0;
793
794RestartTrackletConstructor:
eb08490f 795 if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
796 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
797 {
798 AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);
799 if (CUDASync("Tracklet Initializer")) return 1;
800 DumpRowBlocks(fSlaveTrackers, iSlice);
801 }
802
803 if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
804 AliHLTTPCCATrackletConstructorNewGPU<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>();
805 if (CUDASync("Tracklet Constructor (new)")) return 1;
806
807 StandalonePerfTime(firstSlice, 8);
808
809 if (fDebugLevel >= 4)
810 {
811 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
812 {
813 DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);
814 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
815 if (fDebugLevel >= 5)
816 {
817 HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
818 }
819 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
820 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
821 fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);
822 }
823 }
824
c7d6f78b 825 int runSlices = 0;
826 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)
eb08490f 827 {
c7d6f78b 828 if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;
829 if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);
830 AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));
eb08490f 831 }
832 if (CUDASync("Tracklet Selector")) return 1;
833 StandalonePerfTime(firstSlice, 9);
834
835 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + 0].CommonMemory(), fGpuTracker[0].CommonMemory(), fGpuTracker[0].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[0]));
836 for (int iSliceTmp = 0;iSliceTmp <= sliceCountLocal;iSliceTmp++)
837 {
838 if (iSliceTmp < sliceCountLocal)
839 {
840 int iSlice = iSliceTmp;
841 if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");
842 cudaStreamSynchronize(cudaStreams[iSlice]);
843 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
844 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
845 if (iSlice + 1 < sliceCountLocal)
846 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[iSlice + 1]));
847 }
848
849 if (iSliceTmp)
850 {
851 int iSlice = iSliceTmp - 1;
852 cudaStreamSynchronize(cudaStreams[iSlice]);
853
854 if (fDebugLevel >= 4)
855 {
856 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
857 fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
858 fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);
859 }
860
861 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
862 {
3a4ab1ca 863 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION && nHardCollisions++ < 10)
864 {
865 HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor");
866 for (int i = 0;i < sliceCountLocal;i++)
867 {
868 cudaThreadSynchronize();
869 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));
870 *fSlaveTrackers[firstSlice + i].NTracks() = 0;
871 *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;
872 fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;
873 CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));
874 PreInitRowBlocks<<<30, 256>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());
875 }
876 goto RestartTrackletConstructor;
877 }
eb08490f 878 HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
879 return(1);
880 }
881 if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
882
883 fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
884#ifdef HLTCA_GPU_TIME_PROFILE
8566066c 885 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
eb08490f 886#endif
887 fSlaveTrackers[firstSlice + iSlice].WriteOutput();
888#ifdef HLTCA_GPU_TIME_PROFILE
8566066c 889 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
eb08490f 890 printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
891#endif
892
893 if (fDebugLevel >= 4)
894 {
895 delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();
896 delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();
897 }
898 }
899 }
900
901 StandalonePerfTime(firstSlice, 10);
902
903 if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
904
905#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
906 char* stageAtSync = (char*) malloc(100000000);
907 CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].fStageAtSync, 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
908 cudaFree(fGpuTracker[0].fStageAtSync);
909
910 FILE* fp = fopen("profile.txt", "w+");
911 FILE* fp2 = fopen("profile.bmp", "w+b");
912 int nEmptySync = 0, fEmpty;
913
914 const int bmpheight = 1000;
915 BITMAPFILEHEADER bmpFH;
916 BITMAPINFOHEADER bmpIH;
917 ZeroMemory(&bmpFH, sizeof(bmpFH));
918 ZeroMemory(&bmpIH, sizeof(bmpIH));
919
920 bmpFH.bfType = 19778; //"BM"
921 bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1) * bmpheight ;
922 bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);
923
924 bmpIH.biSize = sizeof(bmpIH);
925 bmpIH.biWidth = HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1;
926 bmpIH.biHeight = bmpheight;
927 bmpIH.biPlanes = 1;
928 bmpIH.biBitCount = 32;
929
930 fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
931 fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);
932
933 for (int i = 0;i < bmpheight * HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;i += HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT)
934 {
935 fEmpty = 1;
936 for (int j = 0;j < HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;j++)
937 {
938 fprintf(fp, "%d\t", stageAtSync[i + j]);
939 int color = 0;
940 if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);
941 if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);
942 if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);
943 if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);
944 fwrite(&color, 1, sizeof(int), fp2);
945 if (j > 0 && j % 32 == 0)
946 {
947 color = RGB(255, 255, 255);
948 fwrite(&color, 1, 4, fp2);
949 }
950 if (stageAtSync[i + j]) fEmpty = 0;
951 }
952 fprintf(fp, "\n");
953 if (fEmpty) nEmptySync++;
954 else nEmptySync = 0;
955 //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;
956 }
957
958 fclose(fp);
959 fclose(fp2);
960 free(stageAtSync);
961#endif
962
963 return(0);
964}
965
int AliHLTTPCCAGPUTrackerNVCC::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
{
	// Initialize the host-side ("slave") tracker for slice iSlice with the given
	// slice parameters. The GPU kernels are compiled for exactly HLTCA_ROW_COUNT
	// rows, so any mismatch (not only an excess) is fatal.
	// Returns 0 on success, 1 on row-count mismatch.
	fSlaveTrackers[iSlice].Initialize(param);
	if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
	{
		// Fixed message: the check is '!=', not '>', so "exceeds" was misleading.
		HLTError("Error, Slice Tracker %d Row Count of %d does not match constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
		return(1);
	}
	return(0);
}
977
int AliHLTTPCCAGPUTrackerNVCC::ExitGPU()
{
	// Uninitialize CUDA: release device memory, per-slice streams and pinned
	// host memory, then tear down the CUDA context.
	// Returns 0 on success, 1 if cudaThreadExit fails.
	cudaThreadSynchronize();
	if (fGPUMemory)
	{
		cudaFree(fGPUMemory);
		fGPUMemory = NULL;
	}
	if (fHostLockedMemory)
	{
		// Streams were created alongside the pinned host buffer; the same
		// count (max of 3 and fSliceCount) must be destroyed here.
		for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
		{
			cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
		}
		free(fpCudaStreams);
		fpCudaStreams = NULL;		// was left dangling after free()
		fGpuTracker = NULL;
		cudaFreeHost(fHostLockedMemory);
		fHostLockedMemory = NULL;	// prevent double-free on a repeated ExitGPU() call
	}

	if (CudaFailedMsg(cudaThreadExit()))
	{
		HLTError("Could not uninitialize GPU");
		return(1);
	}
	HLTInfo("CUDA Uninitialized");
	fgGPUUsed = false;
	fCudaInitialized = 0;
	return(0);
}
1008
void AliHLTTPCCAGPUTrackerNVCC::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
{
	// Store the output control structure and propagate it to every slave tracker.
	fOutputControl = val;
	for (int iTracker = 0;iTracker < fgkNSlices;iTracker++)
	{
		fSlaveTrackers[iTracker].SetOutputControl(val);
	}
}
1018
int AliHLTTPCCAGPUTrackerNVCC::GetThread()
{
	// Return the OS-level identifier of the calling thread (platform specific).
#ifdef R__WIN32
	const size_t winThreadHandle = (size_t) GetCurrentThread();
	return((int) winThreadHandle);
#else
	const long linuxTid = syscall(SYS_gettid);
	return((int) linuxTid);
#endif
}
1028
unsigned long long int* AliHLTTPCCAGPUTrackerNVCC::PerfTimer(int iSlice, unsigned int i)
{
	// Return a pointer to performance timer i of slice iSlice,
	// or NULL when the slave trackers are not available.
	if (fSlaveTrackers == NULL) return(NULL);
	return(fSlaveTrackers[iSlice].PerfTimer(i));
}
1034
const AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerNVCC::OutputControl() const
{
	// Accessor: the output control structure currently in use.
	return(fOutputControl);
}
1040
int AliHLTTPCCAGPUTrackerNVCC::GetSliceCount() const
{
	// Accessor: the maximum number of slices this tracker can process at once.
	return(fSliceCount);
}
1046
AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()
{
	// Factory entry point: instantiate an NVCC-based GPU tracker.
	AliHLTTPCCAGPUTrackerNVCC* tracker = new AliHLTTPCCAGPUTrackerNVCC;
	return(tracker);
}
void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr)
{
	// Factory counterpart: dispose of a tracker created by AliHLTTPCCAGPUTrackerNVCCCreate.
	// NOTE(review): relies on AliHLTTPCCAGPUTracker having a virtual destructor — confirm in header.
	delete ptr;
}
1055