eb08490f 1// **************************************************************************
2// This file is property of and copyright by the ALICE HLT Project *
3// ALICE Experiment at CERN, All rights reserved. *
4// *
5// Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
6// Ivan Kisel <kisel@kip.uni-heidelberg.de> *
7// David Rohr <drohr@kip.uni-heidelberg.de> *
8// for The ALICE HLT Project. *
9// *
10// Permission to use, copy, modify and distribute this software and its *
11// documentation strictly for non-commercial purposes is hereby granted *
12// without fee, provided that the above copyright notice appears in all *
13// copies and that both the copyright notice and this permission notice *
14// appear in the supporting documentation. The authors make no claims *
15// about the suitability of this software for any purpose. It is *
16// provided "as is" without express or implied warranty. *
17// *
18//***************************************************************************
19
20#include "AliHLTTPCCAGPUTracker.h"
21
22#ifdef BUILD_GPU
23
3a4ab1ca 24#include <cuda.h>
73a33d2e 25#ifdef R__WIN32
73a33d2e 26#else
3a4ab1ca 27#include <sys/syscall.h>
444e5682 28#include <semaphore.h>
29#include <fcntl.h>
73a33d2e 30#endif
3a4ab1ca 31
eb08490f 32#include "AliHLTTPCCADef.h"
33#include "AliHLTTPCCAGPUConfig.h"
34
35#include <sm_11_atomic_functions.h>
36#include <sm_12_atomic_functions.h>
37
38#include <iostream>
39
40//Disable assertions since they produce errors in GPU Code
41#ifdef assert
42#undef assert
43#endif
44#define assert(param)
45
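//Constant memory buffer that holds the per-slice tracker objects on the GPU; it is filled via cudaMemcpyToSymbolAsync() in Reconstruct()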
46__constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];
47#ifdef HLTCA_GPU_TEXTURE_FETCH
48texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;
49texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;
50texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
51#endif
52
eb08490f 53//Include the .cxx files; the GPUd() macro will then produce CUDA device code from the tracker source code
54#include "AliHLTTPCCATrackParam.cxx"
55#include "AliHLTTPCCATrack.cxx"
56
eb08490f 57#include "AliHLTTPCCAHitArea.cxx"
58#include "AliHLTTPCCAGrid.cxx"
59#include "AliHLTTPCCARow.cxx"
60#include "AliHLTTPCCAParam.cxx"
61#include "AliHLTTPCCATracker.cxx"
62
eb08490f 63#include "AliHLTTPCCAProcess.h"
64
8566066c 65#include "AliHLTTPCCATrackletSelector.cxx"
eb08490f 66#include "AliHLTTPCCANeighboursFinder.cxx"
eb08490f 67#include "AliHLTTPCCANeighboursCleaner.cxx"
68#include "AliHLTTPCCAStartHitsFinder.cxx"
69#include "AliHLTTPCCAStartHitsSorter.cxx"
70#include "AliHLTTPCCATrackletConstructor.cxx"
71#include "AliHLTTPCCASliceOutput.cxx"
72
73#include "MemoryAssignmentHelpers.h"
74
75#ifndef HLTCA_STANDALONE
76#include "AliHLTDefinitions.h"
77#include "AliHLTSystem.h"
78#endif
79
80ClassImp( AliHLTTPCCAGPUTracker )
81
82bool AliHLTTPCCAGPUTracker::fgGPUUsed = false;
83
444e5682 84#define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem"
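//Name of the system-wide semaphore used to serialize GPU initialization across processes (acquired in InitGPU, released in ReleaseGlobalLock)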
85
86void AliHLTTPCCAGPUTracker::ReleaseGlobalLock(void* sem)
87{
88 //Release the global named semaphore that locks GPU Initialization
89#ifdef R__WIN32
90 HANDLE* h = (HANDLE*) sem;
91 ReleaseSemaphore(*h, 1, NULL);
92 CloseHandle(*h);
93 delete h;
94#else
95 sem_t* pSem = (sem_t*) sem;
96 sem_post(pSem);
97 sem_unlink(SemLockName);
98#endif
99}
100
101int AliHLTTPCCAGPUTracker::CheckMemorySizes(int sliceCount)
102{
103 //Check constants for correct memory sizes
104 if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
105 {
 106 HLTError("Insufficient Tracker Object Memory");
107 return(1);
108 }
109
110 if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
111 {
 112 HLTError("Insufficient Common Memory");
113 return(1);
114 }
115
116 if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
117 {
 118 HLTError("Insufficient Row Memory");
119 return(1);
120 }
121 return(0);
122}
123
eb08490f 124int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
125{
126 //Find best CUDA device, initialize and allocate memory
444e5682 127
128 if (CheckMemorySizes(sliceCount)) return(1);
129
130#ifdef R__WIN32
131 HANDLE* semLock = new HANDLE;
132 *semLock = CreateSemaphore(NULL, 1, 1, SemLockName);
133 if (*semLock == NULL)
134 {
135 HLTError("Error creating GPUInit Semaphore");
136 return(1);
137 }
138 WaitForSingleObject(*semLock, INFINITE);
139#else
140 sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1);
141 if (semLock == SEM_FAILED)
142 {
143 HLTError("Error creating GPUInit Semaphore");
144 return(1);
145 }
146 sem_wait(semLock);
147#endif
148
eb08490f 149 if (fgGPUUsed)
150 {
 151 HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in the same process");
444e5682 152 ReleaseGlobalLock(semLock);
eb08490f 153 return(1);
154 }
3a4ab1ca 155 fgGPUUsed = 1;
73a33d2e 156 fThreadId = GetThread();
eb08490f 157
158 cudaDeviceProp fCudaDeviceProp;
159
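	//Total GPU memory required: fixed row and common buffers plus, per slice, the slice data and the global (hit/tracklet/track) memory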
3a4ab1ca 160 fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
161
eb08490f 162#ifndef CUDA_DEVICE_EMULATION
3a4ab1ca 163 int count, bestDevice = -1;
164 long long int bestDeviceSpeed = 0, deviceSpeed;
eb08490f 165 if (CudaFailedMsg(cudaGetDeviceCount(&count)))
166 {
167 HLTError("Error getting CUDA Device Count");
3a4ab1ca 168 fgGPUUsed = 0;
444e5682 169 ReleaseGlobalLock(semLock);
eb08490f 170 return(1);
171 }
3a4ab1ca 172 if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");
eb08490f 173 for (int i = 0;i < count;i++)
174 {
3a4ab1ca 175 unsigned int free, total;
176 cuInit(0);
177 CUdevice tmpDevice;
178 cuDeviceGet(&tmpDevice, i);
179 CUcontext tmpContext;
180 cuCtxCreate(&tmpContext, 0, tmpDevice);
181 if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";
182 cuCtxDestroy(tmpContext);
183 CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i));
184
185 int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && free >= fGPUMemSize;
186
187 if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %d / %d)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, free, fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");
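		//Rank devices by a simple speed heuristic: multiprocessor count * clock rate * warp size * free memory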
188 deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free;
189 if (deviceOK && deviceSpeed > bestDeviceSpeed)
eb08490f 190 {
191 bestDevice = i;
3a4ab1ca 192 bestDeviceSpeed = deviceSpeed;
eb08490f 193 }
194 }
eb08490f 195 if (bestDevice == -1)
196 {
197 HLTWarning("No CUDA Device available, aborting CUDA Initialisation");
3a4ab1ca 198 fgGPUUsed = 0;
444e5682 199 ReleaseGlobalLock(semLock);
eb08490f 200 return(1);
201 }
202
203 int cudaDevice;
204 if (forceDeviceID == -1)
205 cudaDevice = bestDevice;
206 else
207 cudaDevice = forceDeviceID;
208#else
209 int cudaDevice = 0;
210#endif
211
212 cudaGetDeviceProperties(&fCudaDeviceProp ,cudaDevice );
213
214 if (fDebugLevel >= 1)
215 {
3a4ab1ca 216 HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);
217 HLTInfo("totalGlobalMem = %d", fCudaDeviceProp.totalGlobalMem);
218 HLTInfo("sharedMemPerBlock = %d", fCudaDeviceProp.sharedMemPerBlock);
219 HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);
220 HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);
221 HLTInfo("memPitch = %d", fCudaDeviceProp.memPitch);
222 HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);
223 HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);
224 HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);
225 HLTInfo("totalConstMem = %d", fCudaDeviceProp.totalConstMem);
226 HLTInfo("major = %d", fCudaDeviceProp.major);
227 HLTInfo("minor = %d", fCudaDeviceProp.minor);
 228 HLTInfo("clockRate = %d", fCudaDeviceProp.clockRate);
 229 HLTInfo("textureAlignment = %d", fCudaDeviceProp.textureAlignment);
eb08490f 230 }
231
232 if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
233 {
234 HLTError( "Unsupported CUDA Device" );
3a4ab1ca 235 fgGPUUsed = 0;
444e5682 236 ReleaseGlobalLock(semLock);
3a4ab1ca 237 return(1);
eb08490f 238 }
239
240 if (CudaFailedMsg(cudaSetDevice(cudaDevice)))
241 {
242 HLTError("Could not set CUDA Device!");
3a4ab1ca 243 fgGPUUsed = 0;
444e5682 244 ReleaseGlobalLock(semLock);
eb08490f 245 return(1);
246 }
247
eb08490f 248 if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
249 {
250 HLTError("CUDA Memory Allocation Error");
251 cudaThreadExit();
3a4ab1ca 252 fgGPUUsed = 0;
444e5682 253 ReleaseGlobalLock(semLock);
eb08490f 254 return(1);
255 }
444e5682 256 ReleaseGlobalLock(semLock);
eb08490f 257 if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
258 int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
259 if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
260 {
261 cudaFree(fGPUMemory);
262 cudaThreadExit();
263 HLTError("Error allocating Page Locked Host Memory");
3a4ab1ca 264 fgGPUUsed = 0;
eb08490f 265 return(1);
266 }
267 if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);
268
269 if (fDebugLevel >= 1)
270 {
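		//Debug mode only: fill the GPU buffer with a marker byte (143 = 0x8F), presumably to make uninitialized reads easier to spot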
271 CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
272 }
eb08490f 273
eb08490f 274 fSliceCount = sliceCount;
3a4ab1ca 275 //Don't run the constructor / destructor here, these objects are just a plain memory copy of the Tracker that is later transferred to GPU memory
eb08490f 276 fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);
277
278 for (int i = 0;i < fgkNSlices;i++)
279 {
280 fSlaveTrackers[i].SetGPUTracker();
281 fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
8566066c 282 fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
eb08490f 283 }
284
285 fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
286 cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
287 for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
288 {
289 if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
290 {
3a4ab1ca 291 cudaFree(fGPUMemory);
292 cudaFreeHost(fHostLockedMemory);
293 cudaThreadExit();
294 HLTError("Error creating CUDA Stream");
295 fgGPUUsed = 0;
296 return(1);
eb08490f 297 }
298 }
299
3a4ab1ca 300 fCudaInitialized = 1;
 301 HLTImportant("CUDA Initialisation successful (Device %d: %s, Thread %d)", cudaDevice, fCudaDeviceProp.name, fThreadId);
302
eb08490f 303#if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)
304 if (fDebugLevel < 2)
305 {
306 //Do one initial run for Benchmark reasons
307 const int useDebugLevel = fDebugLevel;
308 fDebugLevel = 0;
8566066c 309 AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];
eb08490f 310
311 std::ifstream fin;
eb08490f 312
eb08490f 313 AliHLTTPCCAParam tmpParam;
314 AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;
8566066c 315
316 fin.open("events/settings.dump");
317 int tmpCount;
318 fin >> tmpCount;
319 for (int i = 0;i < sliceCount;i++)
320 {
321 fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);
322 tmpParam.ReadSettings(fin);
323 InitializeSliceParam(i, tmpParam);
324 }
325 fin.close();
326
327 fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);
328 for (int i = 0;i < sliceCount;i++)
329 {
330 tmpCluster[i].StartReading(i, 0);
331 tmpCluster[i].ReadEvent(fin);
332 tmpCluster[i].FinishReading();
333 }
334 fin.close();
335
336 AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];
337 memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));
338
339 Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);
340 for (int i = 0;i < sliceCount;i++)
341 {
342 free(tmpOutput[i]);
343 tmpOutput[i] = NULL;
344 fSlaveTrackers[i].SetOutputControl(NULL);
345 }
346 delete[] tmpOutput;
347 delete[] tmpCluster;
eb08490f 348 fDebugLevel = useDebugLevel;
349 }
350#endif
eb08490f 351 return(0);
352}
353
354template <class T> inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int alignment)
355{
 356 //Helper template to align pointers.
 357 //Aligns to 1 MB boundaries here, which should be consistent with every alignment in the tracker
 358 //(as long as every single data structure is <= 1 MB)
359
360 size_t adr = (size_t) ptr;
361 if (adr % alignment)
362 {
363 adr += alignment - (adr % alignment);
364 }
365 return((T*) adr);
366}
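//Illustrative usage (hypothetical buffer, with the 1 MB alignment used throughout this file):
//  tmpMem = alignPointer(tmpMem, 1024 * 1024); //e.g. 0x00100001 is moved up to 0x00200000, 0x00100000 stays unchanged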
367
368bool AliHLTTPCCAGPUTracker::CudaFailedMsg(cudaError_t error)
369{
370 //Check for CUDA Error and in the case of an error display the corresponding error string
371 if (error == cudaSuccess) return(false);
372 HLTWarning("CUDA Error: %d / %s", error, cudaGetErrorString(error));
373 return(true);
374}
375
376int AliHLTTPCCAGPUTracker::CUDASync(char* state)
377{
378 //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards
379
380 if (fDebugLevel == 0) return(0);
381 cudaError cuErr;
382 cuErr = cudaGetLastError();
383 if (cuErr != cudaSuccess)
384 {
385 HLTError("Cuda Error %s while invoking kernel (%s)", cudaGetErrorString(cuErr), state);
386 return(1);
387 }
388 if (CudaFailedMsg(cudaThreadSynchronize()))
389 {
390 HLTError("CUDA Error while synchronizing (%s)", state);
391 return(1);
392 }
393 if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
394 return(0);
395}
396
397void AliHLTTPCCAGPUTracker::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
398{
399 //Set Debug Level and Debug output File if applicable
400 fDebugLevel = dwLevel;
401 if (NewOutFile) fOutFile = NewOutFile;
402}
403
404int AliHLTTPCCAGPUTracker::SetGPUTrackerOption(char* OptionName, int /*OptionValue*/)
405{
406 //Set a specific GPU Tracker Option
407 {
408 HLTError("Unknown Option: %s", OptionName);
409 return(1);
410 }
411 //No Options used at the moment
412 //return(0);
413}
414
415#ifdef HLTCA_STANDALONE
416void AliHLTTPCCAGPUTracker::StandalonePerfTime(int iSlice, int i)
417{
418 //Run Performance Query for timer i of slice iSlice
419 if (fDebugLevel >= 1)
420 {
421 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));
422 }
423}
424#else
425void AliHLTTPCCAGPUTracker::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
426#endif
427
428void AliHLTTPCCAGPUTracker::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
429{
430 //Dump Rowblocks to File
431 if (fDebugLevel >= 4)
432 {
433 *fOutFile << "RowBlock Tracklets" << std::endl;
434
435 int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);
436 int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);
437 uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT);
438 CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
439 CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
440 CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT, cudaMemcpyDeviceToHost));
441 CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
442
443 int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
444 for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
445 {
446 *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " <<
8566066c 447 rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";
eb08490f 448 for (int j = 0;j < rowBlockPos[i].x;j++)
449 {
450 //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
451 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
452 if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)
453 {
454 HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);
455 }
456 k++;
457 if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)
458 {
459 HLTError("Error, -1 Tracklet found");
460 }
461 }
8566066c 462 *fOutFile << std::endl << "Phase 2: ";
eb08490f 463 for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
464 {
465 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
466 }
8566066c 467 *fOutFile << std::endl;
eb08490f 468 }
469
470 if (check)
471 {
472 *fOutFile << "Starting Threads: (First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;
473 for (int i = 0;i < HLTCA_GPU_BLOCK_COUNT;i++)
474 {
475 *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;
476 }
477 }
478
479 free(rowBlockPos);
480 free(rowBlockTracklets);
481 free(blockStartingTracklet);
482 }
483}
484
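//The kernel below clears the scheduling structures directly on the GPU before tracking starts:
//a grid-stride loop with vectorized int4 stores sets the row-block positions and hit weights to 0 and the row-block tracklet lists to -1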
485__global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)
486{
487 //Initialize GPU RowBlocks and HitWeights
488 int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;
489 int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;
490 const int stride = blockDim.x * gridDim.x;
491 int4 i0, i1;
492 i0.x = i0.y = i0.z = i0.w = 0;
493 i1.x = i1.y = i1.z = i1.w = -1;
494 for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)
495 RowBlockPos[i] = i0;
496 for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)
497 rowBlockTracklets4[i] = i1;
498 for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)
499 sliceDataHitWeights4[i] = i0;
500}
501
502int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
503{
504 //Primary reconstruction function
505 cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
506
3a4ab1ca 507 if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;
508
509 if (!fCudaInitialized)
510 {
511 HLTError("GPUTracker not initialized");
512 return(1);
513 }
514 if (sliceCountLocal > fSliceCount)
515 {
516 HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);
517 return(1);
518 }
73a33d2e 519 if (fThreadId != GetThread())
3a4ab1ca 520 {
73a33d2e 521 HLTError("GPUTracker context was initialized by different thread, Initializing Thread: %d, Processing Thread: %d", fThreadId, GetThread());
3a4ab1ca 522 return(1);
523 }
524
525 if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);
eb08490f 526
527 if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
528 {
 529 HLTError("Insufficient constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));
530 return(1);
531 }
532
533 if (fDebugLevel >= 4)
534 {
535 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
536 {
8566066c 537 *fOutFile << std::endl << std::endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << std::endl;
eb08490f 538 }
539 }
540
541 memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
542
eb08490f 543 if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
544
545#ifdef HLTCA_GPU_TIME_PROFILE
c7d6f78b 546 unsigned __int64 a, b, c, d;
8566066c 547 AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);
548 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);
eb08490f 549#endif
550
551 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
552 {
553 //Make this a GPU Tracker
554 fGpuTracker[iSlice].SetGPUTracker();
555 fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
8566066c 556 fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
557 fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
eb08490f 558
559 //Set Pointers to GPU Memory
560 char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
561
562 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
563 tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
564 tmpMem = alignPointer(tmpMem, 1024 * 1024);
565
566 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
567 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);
568 tmpMem = alignPointer(tmpMem, 1024 * 1024);
569
570 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
571 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());
572 tmpMem = alignPointer(tmpMem, 1024 * 1024);
573
574 if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
575 {
 576 HLTError("Insufficient Track Memory");
577 return(1);
578 }
579
580 if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
581 {
 582 HLTError("Insufficient Global Memory");
583 return(1);
584 }
585
586 //Initialize Startup Constants
587 *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
588 *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
589 *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
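		//Distribute the available GPU blocks evenly over the slices (with integer arithmetic the per-slice counts sum up to exactly HLTCA_GPU_BLOCK_COUNT)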
590 fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = HLTCA_GPU_BLOCK_COUNT * (iSlice + 1) / sliceCountLocal - HLTCA_GPU_BLOCK_COUNT * (iSlice) / sliceCountLocal;
591 if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);
592 fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
593 fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
594 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
8566066c 595 fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
eb08490f 596 }
597
598#ifdef HLTCA_GPU_TEXTURE_FETCH
599 cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
600 size_t offset;
601 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
602 {
3a4ab1ca 603 HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
eb08490f 604 return(1);
605 }
606 cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
607 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
608 {
3a4ab1ca 609 HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
eb08490f 610 return(1);
611 }
612 cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
613 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
614 {
3a4ab1ca 615 HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
eb08490f 616 return(1);
617 }
618#endif
619
620 //Copy Tracker Object to GPU Memory
621 if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
622#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
623 if (CudaFailedMsg(cudaMalloc(&fGpuTracker[0].fStageAtSync, 100000000))) return(1);
624 CudaFailedMsg(cudaMemset(fGpuTracker[0].fStageAtSync, 0, 100000000));
625#endif
626 CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));
627
628 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
629 {
630 StandalonePerfTime(firstSlice + iSlice, 0);
631
632 //Initialize GPU Slave Tracker
633 if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
8566066c 634 fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
eb08490f 635#ifdef HLTCA_GPU_TIME_PROFILE
8566066c 636 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
eb08490f 637#endif
638 fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
639#ifdef HLTCA_GPU_TIME_PROFILE
8566066c 640 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
eb08490f 641 printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
642#endif
643 if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
644 {
 645 HLTError("Insufficient Slice Data Memory");
646 return(1);
647 }
648
649 /*if (fSlaveTrackers[firstSlice + iSlice].CheckEmptySlice())
650 {
651 if (fDebugLevel >= 3) HLTInfo("Slice Empty, not running GPU Tracker");
652 if (sliceCountLocal == 1)
653 return(0);
654 }*/
655
656 //Initialize temporary memory where needed
657 if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");
658 PreInitRowBlocks<<<30, 256, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());
659
660 //Copy Data to GPU Global Memory
661 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
662 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
663 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
664
665 if (fDebugLevel >= 4)
666 {
667 if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
444e5682 668 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS);
669 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() );
eb08490f 670 }
671
672 if (CUDASync("Initialization")) return(1);
673 StandalonePerfTime(firstSlice + iSlice, 1);
674
675 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");
676 AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
677
678 if (CUDASync("Neighbours finder")) return 1;
679
680 StandalonePerfTime(firstSlice + iSlice, 2);
681
682 if (fDebugLevel >= 4)
683 {
684 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
685 fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
686 }
687
688 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");
689 AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
690 if (CUDASync("Neighbours Cleaner")) return 1;
691
692 StandalonePerfTime(firstSlice + iSlice, 3);
693
694 if (fDebugLevel >= 4)
695 {
696 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
697 fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
698 }
699
700 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");
701 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
702 if (CUDASync("Start Hits Finder")) return 1;
703
704 StandalonePerfTime(firstSlice + iSlice, 4);
705
706 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter");
707 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<30, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
708 if (CUDASync("Start Hits Sorter")) return 1;
709
710 StandalonePerfTime(firstSlice + iSlice, 5);
711
712 if (fDebugLevel >= 2)
713 {
714 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
715 if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
716 if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS)
717 {
 718 HLTError("HLTCA_GPU_MAX_TRACKLETS constant insufficient");
719 return(1);
720 }
721 }
722
723 if (fDebugLevel >= 4)
724 {
725 *fOutFile << "Temporary ";
726 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
727 fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
728 uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
729 CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
730 *fOutFile << "Start Hits Sort Vector:" << std::endl;
731 for (int i = 0;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows();i++)
732 {
733 *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;
734 }
735 free(tmpMemory);
736
737 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
738 fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
739 }
740
741 StandalonePerfTime(firstSlice + iSlice, 6);
742
743 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
744 }
745
746 StandalonePerfTime(firstSlice, 7);
747#ifdef HLTCA_GPU_PREFETCHDATA
748 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
749 {
750 if (fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v) > ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4))
751 {
 752 HLTError("Insufficient GPU shared memory, required: %d, available: %d", fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v), ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4));
753 return(1);
754 }
755 if (fDebugLevel >= 1)
756 {
757 static int infoShown = 0;
758 if (!infoShown)
759 {
760 HLTInfo("GPU Shared Memory Cache Size: %d", 2 * fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v));
761 infoShown = 1;
762 }
763 }
764 }
765#endif
766
3a4ab1ca 767 int nHardCollisions = 0;
768
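//Re-entry point: on a hard scheduling collision the common memory and row blocks are reset and the tracklet constructor is rerun via goto (see the error handling below)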
769RestartTrackletConstructor:
eb08490f 770 if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
771 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
772 {
773 AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);
774 if (CUDASync("Tracklet Initializer")) return 1;
775 DumpRowBlocks(fSlaveTrackers, iSlice);
776 }
777
778 if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
779 AliHLTTPCCATrackletConstructorNewGPU<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>();
780 if (CUDASync("Tracklet Constructor (new)")) return 1;
781
782 StandalonePerfTime(firstSlice, 8);
783
784 if (fDebugLevel >= 4)
785 {
786 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
787 {
788 DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);
789 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
790 if (fDebugLevel >= 5)
791 {
792 HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
793 }
794 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
795 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
796 fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);
797 }
798 }
799
c7d6f78b 800 int runSlices = 0;
801 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)
eb08490f 802 {
c7d6f78b 803 if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;
804 if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);
805 AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));
eb08490f 806 }
807 if (CUDASync("Tracklet Selector")) return 1;
808 StandalonePerfTime(firstSlice, 9);
809
810 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + 0].CommonMemory(), fGpuTracker[0].CommonMemory(), fGpuTracker[0].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[0]));
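	//Pipelined output: the loop below overlaps the asynchronous download of the tracks of slice i with the host-side WriteOutput() of slice i - 1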
811 for (int iSliceTmp = 0;iSliceTmp <= sliceCountLocal;iSliceTmp++)
812 {
813 if (iSliceTmp < sliceCountLocal)
814 {
815 int iSlice = iSliceTmp;
 816 if (fDebugLevel >= 3) HLTInfo("Transferring Tracks from GPU to Host");
817 cudaStreamSynchronize(cudaStreams[iSlice]);
818 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
819 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
820 if (iSlice + 1 < sliceCountLocal)
821 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[iSlice + 1]));
822 }
823
824 if (iSliceTmp)
825 {
826 int iSlice = iSliceTmp - 1;
827 cudaStreamSynchronize(cudaStreams[iSlice]);
828
829 if (fDebugLevel >= 4)
830 {
831 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
832 fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
833 fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);
834 }
835
836 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
837 {
3a4ab1ca 838 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION && nHardCollisions++ < 10)
839 {
 840 HLTWarning("Hard scheduling collision occurred, rerunning Tracklet Constructor");
841 for (int i = 0;i < sliceCountLocal;i++)
842 {
843 cudaThreadSynchronize();
844 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));
845 *fSlaveTrackers[firstSlice + i].NTracks() = 0;
846 *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;
847 fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;
848 CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));
849 PreInitRowBlocks<<<30, 256>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());
850 }
851 goto RestartTrackletConstructor;
852 }
eb08490f 853 HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
854 return(1);
855 }
 856 if (fDebugLevel >= 3) HLTInfo("Tracks Transferred: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
857
858 fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
859#ifdef HLTCA_GPU_TIME_PROFILE
8566066c 860 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
eb08490f 861#endif
862 fSlaveTrackers[firstSlice + iSlice].WriteOutput();
863#ifdef HLTCA_GPU_TIME_PROFILE
8566066c 864 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
eb08490f 865 printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
866#endif
867
868 if (fDebugLevel >= 4)
869 {
870 delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();
871 delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();
872 }
873 }
874 }
875
876 StandalonePerfTime(firstSlice, 10);
877
878 if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
879
880#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
881 char* stageAtSync = (char*) malloc(100000000);
882 CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].fStageAtSync, 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
883 cudaFree(fGpuTracker[0].fStageAtSync);
884
885 FILE* fp = fopen("profile.txt", "w+");
886 FILE* fp2 = fopen("profile.bmp", "w+b");
887 int nEmptySync = 0, fEmpty;
888
889 const int bmpheight = 1000;
890 BITMAPFILEHEADER bmpFH;
891 BITMAPINFOHEADER bmpIH;
892 ZeroMemory(&bmpFH, sizeof(bmpFH));
893 ZeroMemory(&bmpIH, sizeof(bmpIH));
894
895 bmpFH.bfType = 19778; //"BM"
896 bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1) * bmpheight ;
897 bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);
898
899 bmpIH.biSize = sizeof(bmpIH);
900 bmpIH.biWidth = HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1;
901 bmpIH.biHeight = bmpheight;
902 bmpIH.biPlanes = 1;
903 bmpIH.biBitCount = 32;
904
905 fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
906 fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);
907
908 for (int i = 0;i < bmpheight * HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;i += HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT)
909 {
910 fEmpty = 1;
911 for (int j = 0;j < HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;j++)
912 {
913 fprintf(fp, "%d\t", stageAtSync[i + j]);
914 int color = 0;
915 if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);
916 if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);
917 if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);
918 if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);
919 fwrite(&color, 1, sizeof(int), fp2);
920 if (j > 0 && j % 32 == 0)
921 {
922 color = RGB(255, 255, 255);
923 fwrite(&color, 1, 4, fp2);
924 }
925 if (stageAtSync[i + j]) fEmpty = 0;
926 }
927 fprintf(fp, "\n");
928 if (fEmpty) nEmptySync++;
929 else nEmptySync = 0;
930 //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;
931 }
932
933 fclose(fp);
934 fclose(fp2);
935 free(stageAtSync);
936#endif
937
938 return(0);
939}
940
941int AliHLTTPCCAGPUTracker::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
942{
943 //Initialize Slice Tracker Parameter for a slave tracker
944 fSlaveTrackers[iSlice].Initialize(param);
945 if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
946 {
 947 HLTError("Error, Slice Tracker %d Row Count of %d does not match constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
948 return(1);
949 }
950 return(0);
951}
952
953int AliHLTTPCCAGPUTracker::ExitGPU()
954{
955 //Uninitialize CUDA
956 cudaThreadSynchronize();
957 if (fGPUMemory)
958 {
959 cudaFree(fGPUMemory);
960 fGPUMemory = NULL;
961 }
962 if (fHostLockedMemory)
963 {
964 for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
965 {
966 cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
967 }
968 free(fpCudaStreams);
969 fGpuTracker = NULL;
970 cudaFreeHost(fHostLockedMemory);
971 }
972
973 if (CudaFailedMsg(cudaThreadExit()))
974 {
975 HLTError("Could not uninitialize GPU");
976 return(1);
977 }
978 HLTInfo("CUDA Uninitialized");
979 fgGPUUsed = false;
3a4ab1ca 980 fCudaInitialized = 0;
eb08490f 981 return(0);
982}
983
984void AliHLTTPCCAGPUTracker::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
985{
444e5682 986 //Set Output Control Pointers
eb08490f 987 fOutputControl = val;
988 for (int i = 0;i < fgkNSlices;i++)
989 {
990 fSlaveTrackers[i].SetOutputControl(val);
991 }
992}
993
73a33d2e 994int AliHLTTPCCAGPUTracker::GetThread()
995{
444e5682 996 //Get Thread ID
73a33d2e 997#ifdef R__WIN32
998 return((int) (size_t) GetCurrentThread());
999#else
1000 return((int) syscall (SYS_gettid));
1001#endif
1002}
1003
8566066c 1004#endif