]>
Commit | Line | Data |
---|---|---|
eb08490f | 1 | // ************************************************************************** |
2 | // This file is property of and copyright by the ALICE HLT Project * | |
3 | // ALICE Experiment at CERN, All rights reserved. * | |
4 | // * | |
5 | // Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> * | |
6 | // Ivan Kisel <kisel@kip.uni-heidelberg.de> * | |
7 | // David Rohr <drohr@kip.uni-heidelberg.de> * | |
8 | // for The ALICE HLT Project. * | |
9 | // * | |
10 | // Permission to use, copy, modify and distribute this software and its * | |
11 | // documentation strictly for non-commercial purposes is hereby granted * | |
12 | // without fee, provided that the above copyright notice appears in all * | |
13 | // copies and that both the copyright notice and this permission notice * | |
14 | // appear in the supporting documentation. The authors make no claims * | |
15 | // about the suitability of this software for any purpose. It is * | |
16 | // provided "as is" without express or implied warranty. * | |
17 | // * | |
18 | //*************************************************************************** | |
19 | ||
31649d4b | 20 | #include "AliHLTTPCCAGPUTrackerNVCC.h" |
eb08490f | 21 | |
31649d4b | 22 | #ifdef HLTCA_GPUCODE |
3a4ab1ca | 23 | #include <cuda.h> |
31649d4b | 24 | #include <sm_11_atomic_functions.h> |
25 | #include <sm_12_atomic_functions.h> | |
26 | #endif | |
27 | ||
73a33d2e | 28 | #ifdef R__WIN32 |
73a33d2e | 29 | #else |
3a4ab1ca | 30 | #include <sys/syscall.h> |
444e5682 | 31 | #include <semaphore.h> |
32 | #include <fcntl.h> | |
73a33d2e | 33 | #endif |
3a4ab1ca | 34 | |
eb08490f | 35 | #include "AliHLTTPCCADef.h" |
36 | #include "AliHLTTPCCAGPUConfig.h" | |
37 | ||
eb08490f | 38 | |
39 | #include <iostream> | |
40 | ||
41 | //Disable assertions since they produce errors in GPU Code | |
42 | #ifdef assert | |
43 | #undef assert | |
44 | #endif | |
45 | #define assert(param) | |
46 | ||
47 | __constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )]; | |
48 | #ifdef HLTCA_GPU_TEXTURE_FETCH | |
49 | texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2; | |
50 | texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu; | |
51 | texture<signed short, 1, cudaReadModeElementType> gAliTexRefs; | |
52 | #endif | |
53 | ||
eb08490f | 54 | //Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code |
55 | #include "AliHLTTPCCATrackParam.cxx" | |
56 | #include "AliHLTTPCCATrack.cxx" | |
57 | ||
eb08490f | 58 | #include "AliHLTTPCCAHitArea.cxx" |
59 | #include "AliHLTTPCCAGrid.cxx" | |
60 | #include "AliHLTTPCCARow.cxx" | |
61 | #include "AliHLTTPCCAParam.cxx" | |
62 | #include "AliHLTTPCCATracker.cxx" | |
63 | ||
eb08490f | 64 | #include "AliHLTTPCCAProcess.h" |
65 | ||
8566066c | 66 | #include "AliHLTTPCCATrackletSelector.cxx" |
eb08490f | 67 | #include "AliHLTTPCCANeighboursFinder.cxx" |
eb08490f | 68 | #include "AliHLTTPCCANeighboursCleaner.cxx" |
69 | #include "AliHLTTPCCAStartHitsFinder.cxx" | |
70 | #include "AliHLTTPCCAStartHitsSorter.cxx" | |
71 | #include "AliHLTTPCCATrackletConstructor.cxx" | |
72 | #include "AliHLTTPCCASliceOutput.cxx" | |
73 | ||
74 | #include "MemoryAssignmentHelpers.h" | |
75 | ||
76 | #ifndef HLTCA_STANDALONE | |
77 | #include "AliHLTDefinitions.h" | |
78 | #include "AliHLTSystem.h" | |
79 | #endif | |
80 | ||
31649d4b | 81 | ClassImp( AliHLTTPCCAGPUTrackerNVCC ) |
eb08490f | 82 | |
31649d4b | 83 | bool AliHLTTPCCAGPUTrackerNVCC::fgGPUUsed = false; |
eb08490f | 84 | |
444e5682 | 85 | #define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem" |
86 | ||
AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() :
	fGpuTracker(NULL),
	fGPUMemory(NULL),
	fHostLockedMemory(NULL),
	fDebugLevel(0),
	fOutFile(NULL),
	fGPUMemSize(0),
	fpCudaStreams(NULL),
	fSliceCount(0),
	fOutputControl(NULL),
	fThreadId(0),
	fCudaInitialized(0)
{
	// All real initialization (device selection, memory allocation) happens in InitGPU()
}

AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC()
{
	// Intentionally empty; resources are released elsewhere - TODO confirm the matching
	// cleanup path (fGPUMemory / fHostLockedMemory / fpCudaStreams) is not visible in this chunk.
}
102 | ||
void AliHLTTPCCAGPUTrackerNVCC::ReleaseGlobalLock(void* sem)
{
	//Release the global named semaphore that serializes GPU initialization across processes.
	//sem is the platform-specific handle created in InitGPU (HANDLE* on Windows, sem_t* on POSIX).
#ifdef R__WIN32
	HANDLE* h = (HANDLE*) sem;
	ReleaseSemaphore(*h, 1, NULL);
	CloseHandle(*h);
	delete h;
#else
	sem_t* pSem = (sem_t*) sem;
	sem_post(pSem);
	sem_unlink(SemLockName);
	sem_close(pSem);	//Close our descriptor as well; sem_post/sem_unlink alone leak it
#endif
}
117 | ||
int AliHLTTPCCAGPUTrackerNVCC::CheckMemorySizes(int sliceCount)
{
	//Check the compile-time memory-size constants against the requested slice count.
	//Returns 0 when all fixed memory regions are large enough, 1 otherwise.
	if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
	{
		HLTError("Insufficient Tracker Object Memory");
		return(1);
	}

	if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
	{
		HLTError("Insufficient Common Memory");
		return(1);
	}

	if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
	{
		HLTError("Insufficient Row Memory");
		return(1);
	}
	return(0);
}
140 | ||
int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)
{
	//Find the best CUDA device, initialize it and allocate GPU plus page-locked host memory.
	//Device probing and initialization are serialized system-wide via a named semaphore.
	//Returns 0 on success, 1 on any failure (the semaphore is always released on failure paths).

	if (CheckMemorySizes(sliceCount)) return(1);

#ifdef R__WIN32
	HANDLE* semLock = new HANDLE;
	*semLock = CreateSemaphore(NULL, 1, 1, SemLockName);
	if (*semLock == NULL)
	{
		HLTError("Error creating GPUInit Semaphore");
		return(1);
	}
	WaitForSingleObject(*semLock, INFINITE);
#else
	sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1);
	if (semLock == SEM_FAILED)
	{
		HLTError("Error creating GPUInit Semaphore");
		return(1);
	}
	sem_wait(semLock);
#endif

	if (fgGPUUsed)
	{
		HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in same process");
		ReleaseGlobalLock(semLock);
		return(1);
	}
	fgGPUUsed = 1;
	fThreadId = GetThread();

	//NOTE(review): local variable despite the member-style f-prefix name; kept for minimal diff
	cudaDeviceProp fCudaDeviceProp;

	fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);

#ifndef CUDA_DEVICE_EMULATION
	int count, bestDevice = -1;
	long long int bestDeviceSpeed = 0, deviceSpeed;
	if (CudaFailedMsg(cudaGetDeviceCount(&count)))
	{
		HLTError("Error getting CUDA Device Count");
		fgGPUUsed = 0;
		ReleaseGlobalLock(semLock);
		return(1);
	}
	if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");
	for (int i = 0;i < count;i++)
	{
		//Query free device memory via the driver API; requires a temporary context on the device
		unsigned int free, total;
		cuInit(0);
		CUdevice tmpDevice;
		cuDeviceGet(&tmpDevice, i);
		CUcontext tmpContext;
		cuCtxCreate(&tmpContext, 0, tmpDevice);
		if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";
		cuCtxDestroy(tmpContext);
		CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i));

		//Usable device: compute capability >= 1.2 (and < 9) with enough free memory for our buffers
		int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && free >= fGPUMemSize;

		if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %d / %lld)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, free, (long long int) fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");
		//Heuristic device ranking: SM count * clock * warp size * free memory
		deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free;
		if (deviceOK && deviceSpeed > bestDeviceSpeed)
		{
			bestDevice = i;
			bestDeviceSpeed = deviceSpeed;
		}
	}
	if (bestDevice == -1)
	{
		HLTWarning("No CUDA Device available, aborting CUDA Initialisation");
		fgGPUUsed = 0;
		ReleaseGlobalLock(semLock);
		return(1);
	}

	int cudaDevice;
	if (forceDeviceID == -1)
		cudaDevice = bestDevice;
	else
		cudaDevice = forceDeviceID;
#else
	int cudaDevice = 0;
#endif

	cudaGetDeviceProperties(&fCudaDeviceProp, cudaDevice);

	if (fDebugLevel >= 1)
	{
		HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);
		HLTInfo("totalGlobalMem = %lld", (unsigned long long int) fCudaDeviceProp.totalGlobalMem);
		HLTInfo("sharedMemPerBlock = %lld", (unsigned long long int) fCudaDeviceProp.sharedMemPerBlock);
		HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);
		HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);
		HLTInfo("memPitch = %lld", (unsigned long long int) fCudaDeviceProp.memPitch);
		HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);
		HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);
		HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);
		HLTInfo("totalConstMem = %lld", (unsigned long long int) fCudaDeviceProp.totalConstMem);
		HLTInfo("major = %d", fCudaDeviceProp.major);
		HLTInfo("minor = %d", fCudaDeviceProp.minor);
		HLTInfo("clockRate = %d", fCudaDeviceProp.clockRate);
		HLTInfo("textureAlignment = %lld", (unsigned long long int) fCudaDeviceProp.textureAlignment);
	}

	if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
	{
		HLTError( "Unsupported CUDA Device" );
		fgGPUUsed = 0;
		ReleaseGlobalLock(semLock);
		return(1);
	}

	if (CudaFailedMsg(cudaSetDevice(cudaDevice)))
	{
		HLTError("Could not set CUDA Device!");
		fgGPUUsed = 0;
		ReleaseGlobalLock(semLock);
		return(1);
	}

	if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
	{
		HLTError("CUDA Memory Allocation Error");
		cudaThreadExit();
		fgGPUUsed = 0;
		ReleaseGlobalLock(semLock);
		return(1);
	}
	ReleaseGlobalLock(semLock);
	if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
	int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
	//Page-locked host memory for fast async transfers
	if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
	{
		cudaFree(fGPUMemory);
		cudaThreadExit();
		HLTError("Error allocating Page Locked Host Memory");
		fgGPUUsed = 0;
		return(1);
	}
	if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);

	if (fDebugLevel >= 1)
	{
		//Poison GPU memory in debug mode to expose reads of uninitialized data
		CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
	}

	fSliceCount = sliceCount;
	//Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
	fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);

	for (int i = 0;i < fgkNSlices;i++)
	{
		fSlaveTrackers[i].SetGPUTracker();
		fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
		fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
	}

	//At least 3 streams are needed for the copy/compute overlap in Reconstruct
	fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
	cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
	for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
	{
		if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
		{
			cudaFree(fGPUMemory);
			cudaFreeHost(fHostLockedMemory);
			cudaThreadExit();
			HLTError("Error creating CUDA Stream");
			fgGPUUsed = 0;
			return(1);
		}
	}

	fCudaInitialized = 1;
	HLTImportant("CUDA Initialisation successful (Device %d: %s, Thread %d)", cudaDevice, fCudaDeviceProp.name, fThreadId);

#if defined(HLTCA_STANDALONE) && !defined(CUDA_DEVICE_EMULATION)
	if (fDebugLevel < 2)
	{
		//Do one initial run for Benchmark reasons
		const int useDebugLevel = fDebugLevel;
		fDebugLevel = 0;
		AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];

		std::ifstream fin;

		AliHLTTPCCAParam tmpParam;
		AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;

		fin.open("events/settings.dump");
		int tmpCount;
		fin >> tmpCount;	//Slice count stored in the dump file; value not needed here
		for (int i = 0;i < sliceCount;i++)
		{
			fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);
			tmpParam.ReadSettings(fin);
			InitializeSliceParam(i, tmpParam);
		}
		fin.close();

		fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);
		for (int i = 0;i < sliceCount;i++)
		{
			tmpCluster[i].StartReading(i, 0);
			tmpCluster[i].ReadEvent(fin);
			tmpCluster[i].FinishReading();
		}
		fin.close();

		AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];
		memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));

		Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);
		for (int i = 0;i < sliceCount;i++)
		{
			//NOTE(review): released with free() - presumably Reconstruct allocates the outputs with malloc; verify
			free(tmpOutput[i]);
			tmpOutput[i] = NULL;
			fSlaveTrackers[i].SetOutputControl(NULL);
		}
		delete[] tmpOutput;
		delete[] tmpCluster;
		fDebugLevel = useDebugLevel;
	}
#endif
	return(0);
}
370 | ||
template <class T> inline T* AliHLTTPCCAGPUTrackerNVCC::alignPointer(T* ptr, int alignment)
{
	//Round ptr up to the next multiple of alignment and return it unchanged if already aligned.
	//Used to place buffers at 1 MB boundaries, which must stay consistent with every
	//alignment assumption in the tracker (valid as long as each data structure is <= 1 MB).
	size_t address = (size_t) ptr;
	const size_t remainder = address % alignment;
	if (remainder != 0)
	{
		address += alignment - remainder;
	}
	return((T*) address);
}
384 | ||
bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsg(cudaError_t error)
{
	//Report a CUDA error (if any) with its descriptive string.
	//Returns true when an error occurred, false on cudaSuccess.
	if (error != cudaSuccess)
	{
		HLTWarning("CUDA Error: %d / %s", error, cudaGetErrorString(error));
		return(true);
	}
	return(false);
}
392 | ||
int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state)
{
	//Wait for the running CUDA kernel to finish and check for errors afterwards.
	//state is a short label identifying the call site, used only in error messages.
	//Returns 0 on success, 1 on any CUDA error.

	//In production (debug level 0) skip the synchronization entirely to keep execution asynchronous
	if (fDebugLevel == 0) return(0);
	cudaError lastError = cudaGetLastError();
	if (lastError != cudaSuccess)
	{
		HLTError("Cuda Error %s while invoking kernel (%s)", cudaGetErrorString(lastError), state);
		return(1);
	}
	if (CudaFailedMsg(cudaThreadSynchronize()))
	{
		HLTError("CUDA Error while synchronizing (%s)", state);
		return(1);
	}
	if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
	return(0);
}
413 | ||
void AliHLTTPCCAGPUTrackerNVCC::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
{
	//Set the debug verbosity and, when a stream is supplied, redirect debug output to it.
	fDebugLevel = dwLevel;
	if (NewOutFile != NULL)
	{
		fOutFile = NewOutFile;
	}
}
420 | ||
int AliHLTTPCCAGPUTrackerNVCC::SetGPUTrackerOption(char* OptionName, int /*OptionValue*/)
{
	//Set a specific GPU tracker option by name.
	//No options are implemented at the moment, so every name is rejected with an error (return 1).
	HLTError("Unknown Option: %s", OptionName);
	return(1);
}
431 | ||
#ifdef HLTCA_STANDALONE
void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int iSlice, int i)
{
	//Record a performance timestamp into timer i of slice iSlice (debug builds only).
	if (fDebugLevel < 1) return;
	AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(fSlaveTrackers[iSlice].PerfTimer(i));
}
#else
//No-op outside the standalone build
void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
#endif
444 | ||
void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
{
	//Dump the tracklet-scheduling RowBlocks of slice iSlice from GPU memory to the debug file.
	//When check is set, additionally verify that the phase-1 tracklet numbers form the expected
	//consecutive sequence starting at fScheduleFirstDynamicTracklet.
	if (fDebugLevel >= 4)
	{
		*fOutFile << "RowBlock Tracklets" << std::endl;

		//Number of row blocks; same formula the tracker uses internally
		const int nRowBlocks = tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;

		int4* rowBlockPos = (int4*) malloc(sizeof(int4) * nRowBlocks * 2);
		int* rowBlockTracklets = (int*) malloc(sizeof(int) * nRowBlocks * HLTCA_GPU_MAX_TRACKLETS * 2);
		uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT);
		if (rowBlockPos == NULL || rowBlockTracklets == NULL || blockStartingTracklet == NULL)
		{
			//Debug-only dump: report and bail out instead of dereferencing a NULL buffer
			HLTError("Memory Allocation Error in DumpRowBlocks");
			free(rowBlockPos);
			free(rowBlockTracklets);
			free(blockStartingTracklet);
			return;
		}
		CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * nRowBlocks * 2, cudaMemcpyDeviceToHost));
		CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * nRowBlocks * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
		CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT, cudaMemcpyDeviceToHost));
		CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));

		//Expected next tracklet number when checking
		int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
		for (int i = 0; i < nRowBlocks;i++)
		{
			*fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " <<
				rowBlockPos[nRowBlocks + i].y << "/" << rowBlockPos[nRowBlocks + i].x << std::endl << "Phase 1: ";
			for (int j = 0;j < rowBlockPos[i].x;j++)
			{
				//Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
				const int trackletIndex = rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j];
				*fOutFile << trackletIndex << ", ";
				if (check && trackletIndex != k)
				{
					HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, trackletIndex, k);
				}
				k++;
				if (trackletIndex == -1)
				{
					HLTError("Error, -1 Tracklet found");
				}
			}
			*fOutFile << std::endl << "Phase 2: ";
			for (int j = 0;j < rowBlockPos[nRowBlocks + i].x;j++)
			{
				*fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
			}
			*fOutFile << std::endl;
		}

		if (check)
		{
			*fOutFile << "Starting Threads: (First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;
			for (int i = 0;i < HLTCA_GPU_BLOCK_COUNT;i++)
			{
				*fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;
			}
		}

		free(rowBlockPos);
		free(rowBlockTracklets);
		free(blockStartingTracklet);
	}
}
501 | ||
__global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)
{
	//Reset the scheduler RowBlock structures and the per-hit weight array before tracking starts.
	//RowBlockPos is zeroed, RowBlockTracklets is filled with -1, SliceDataHitWeights is zeroed.
	//All writes use int4 vector stores for bandwidth; buffers are assumed 16-byte aligned.
	int4* const trackletBuf4 = (int4*) RowBlockTracklets;
	int4* const hitWeights4 = (int4*) SliceDataHitWeights;
	const int nRowBlocks = HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1;
	const int gridStride = blockDim.x * gridDim.x;
	const int firstIdx = blockIdx.x * blockDim.x + threadIdx.x;
	int4 zero4;
	zero4.x = zero4.y = zero4.z = zero4.w = 0;
	int4 minusOne4;
	minusOne4.x = minusOne4.y = minusOne4.z = minusOne4.w = -1;
	//Grid-stride loops so any launch configuration covers the full buffers
	for (int i = firstIdx;i < 2 * nRowBlocks;i += gridStride)
		RowBlockPos[i] = zero4;
	for (int i = firstIdx;i < (int) (sizeof(int) * nRowBlocks * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4));i += gridStride)
		trackletBuf4[i] = minusOne4;
	for (int i = firstIdx;i < (int) (nSliceDataHits * sizeof(int) / sizeof(int4));i += gridStride)
		hitWeights4[i] = zero4;
}
518 | ||
31649d4b | 519 | int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal) |
eb08490f | 520 | { |
521 | //Primary reconstruction function | |
522 | cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams; | |
523 | ||
3a4ab1ca | 524 | if (sliceCountLocal == -1) sliceCountLocal = fSliceCount; |
525 | ||
526 | if (!fCudaInitialized) | |
527 | { | |
528 | HLTError("GPUTracker not initialized"); | |
529 | return(1); | |
530 | } | |
531 | if (sliceCountLocal > fSliceCount) | |
532 | { | |
533 | HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal); | |
534 | return(1); | |
535 | } | |
73a33d2e | 536 | if (fThreadId != GetThread()) |
3a4ab1ca | 537 | { |
73a33d2e | 538 | HLTError("GPUTracker context was initialized by different thread, Initializing Thread: %d, Processing Thread: %d", fThreadId, GetThread()); |
3a4ab1ca | 539 | return(1); |
540 | } | |
541 | ||
542 | if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal); | |
eb08490f | 543 | |
544 | if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM) | |
545 | { | |
546 | HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData)); | |
547 | return(1); | |
548 | } | |
549 | ||
550 | if (fDebugLevel >= 4) | |
551 | { | |
552 | for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) | |
553 | { | |
8566066c | 554 | *fOutFile << std::endl << std::endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << std::endl; |
eb08490f | 555 | } |
556 | } | |
557 | ||
558 | memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal); | |
559 | ||
eb08490f | 560 | if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants"); |
561 | ||
562 | #ifdef HLTCA_GPU_TIME_PROFILE | |
c7d6f78b | 563 | unsigned __int64 a, b, c, d; |
8566066c | 564 | AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c); |
565 | AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d); | |
eb08490f | 566 | #endif |
567 | ||
568 | for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) | |
569 | { | |
570 | //Make this a GPU Tracker | |
571 | fGpuTracker[iSlice].SetGPUTracker(); | |
572 | fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice)); | |
8566066c | 573 | fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice)); |
574 | fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false); | |
eb08490f | 575 | |
576 | //Set Pointers to GPU Memory | |
577 | char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice); | |
578 | ||
579 | if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory"); | |
580 | tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters()); | |
581 | tmpMem = alignPointer(tmpMem, 1024 * 1024); | |
582 | ||
583 | if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory"); | |
584 | tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/); | |
585 | tmpMem = alignPointer(tmpMem, 1024 * 1024); | |
586 | ||
587 | if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory"); | |
588 | tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters()); | |
589 | tmpMem = alignPointer(tmpMem, 1024 * 1024); | |
590 | ||
591 | if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY) | |
592 | { | |
593 | HLTError("Insufficiant Track Memory"); | |
594 | return(1); | |
595 | } | |
596 | ||
597 | if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY) | |
598 | { | |
599 | HLTError("Insufficiant Global Memory"); | |
600 | return(1); | |
601 | } | |
602 | ||
603 | //Initialize Startup Constants | |
604 | *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0; | |
605 | *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0; | |
606 | *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0; | |
607 | fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = HLTCA_GPU_BLOCK_COUNT * (iSlice + 1) / sliceCountLocal - HLTCA_GPU_BLOCK_COUNT * (iSlice) / sliceCountLocal; | |
608 | if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount); | |
609 | fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice; | |
610 | fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal; | |
611 | fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0; | |
8566066c | 612 | fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory()); |
eb08490f | 613 | } |
614 | ||
615 | #ifdef HLTCA_GPU_TEXTURE_FETCH | |
616 | cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>(); | |
617 | size_t offset; | |
618 | if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset) | |
619 | { | |
3a4ab1ca | 620 | HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset); |
eb08490f | 621 | return(1); |
622 | } | |
623 | cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>(); | |
624 | if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset) | |
625 | { | |
3a4ab1ca | 626 | HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset); |
eb08490f | 627 | return(1); |
628 | } | |
629 | cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>(); | |
630 | if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset) | |
631 | { | |
3a4ab1ca | 632 | HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset); |
eb08490f | 633 | return(1); |
634 | } | |
635 | #endif | |
636 | ||
637 | //Copy Tracker Object to GPU Memory | |
638 | if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU"); | |
639 | #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE | |
640 | if (CudaFailedMsg(cudaMalloc(&fGpuTracker[0].fStageAtSync, 100000000))) return(1); | |
641 | CudaFailedMsg(cudaMemset(fGpuTracker[0].fStageAtSync, 0, 100000000)); | |
642 | #endif | |
643 | CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0])); | |
644 | ||
645 | for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) | |
646 | { | |
647 | StandalonePerfTime(firstSlice + iSlice, 0); | |
648 | ||
649 | //Initialize GPU Slave Tracker | |
650 | if (fDebugLevel >= 3) HLTInfo("Creating Slice Data"); | |
8566066c | 651 | fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice)); |
eb08490f | 652 | #ifdef HLTCA_GPU_TIME_PROFILE |
8566066c | 653 | AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a); |
eb08490f | 654 | #endif |
655 | fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]); | |
31649d4b | 656 | |
657 | if (fDebugLevel >= 4) | |
658 | { | |
659 | *fOutFile << std::endl << std::endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << std::endl; | |
660 | *fOutFile << "Slice Data:" << std::endl; | |
661 | fSlaveTrackers[firstSlice + iSlice].DumpSliceData(*fOutFile); | |
662 | } | |
663 | ||
eb08490f | 664 | #ifdef HLTCA_GPU_TIME_PROFILE |
8566066c | 665 | AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b); |
eb08490f | 666 | printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c); |
667 | #endif | |
668 | if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY) | |
669 | { | |
670 | HLTError("Insufficiant Slice Data Memory"); | |
671 | return(1); | |
672 | } | |
673 | ||
674 | /*if (fSlaveTrackers[firstSlice + iSlice].CheckEmptySlice()) | |
675 | { | |
676 | if (fDebugLevel >= 3) HLTInfo("Slice Empty, not running GPU Tracker"); | |
677 | if (sliceCountLocal == 1) | |
678 | return(0); | |
679 | }*/ | |
680 | ||
681 | //Initialize temporary memory where needed | |
682 | if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory"); | |
683 | PreInitRowBlocks<<<30, 256, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign()); | |
684 | ||
685 | //Copy Data to GPU Global Memory | |
686 | CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1])); | |
687 | CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1])); | |
688 | CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1])); | |
689 | ||
690 | if (fDebugLevel >= 4) | |
691 | { | |
692 | if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory"); | |
444e5682 | 693 | fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS); |
694 | fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() ); | |
eb08490f | 695 | } |
696 | ||
697 | if (CUDASync("Initialization")) return(1); | |
698 | StandalonePerfTime(firstSlice + iSlice, 1); | |
699 | ||
700 | if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder"); | |
701 | AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), 256, 0, cudaStreams[iSlice & 1]>>>(iSlice); | |
702 | ||
703 | if (CUDASync("Neighbours finder")) return 1; | |
704 | ||
705 | StandalonePerfTime(firstSlice + iSlice, 2); | |
706 | ||
707 | if (fDebugLevel >= 4) | |
708 | { | |
709 | CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost)); | |
710 | fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile); | |
711 | } | |
712 | ||
713 | if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner"); | |
714 | AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice); | |
715 | if (CUDASync("Neighbours Cleaner")) return 1; | |
716 | ||
717 | StandalonePerfTime(firstSlice + iSlice, 3); | |
718 | ||
719 | if (fDebugLevel >= 4) | |
720 | { | |
721 | CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost)); | |
722 | fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile); | |
723 | } | |
724 | ||
725 | if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder"); | |
726 | AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice); | |
727 | if (CUDASync("Start Hits Finder")) return 1; | |
728 | ||
729 | StandalonePerfTime(firstSlice + iSlice, 4); | |
730 | ||
731 | if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter"); | |
732 | AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<30, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice); | |
733 | if (CUDASync("Start Hits Sorter")) return 1; | |
734 | ||
735 | StandalonePerfTime(firstSlice + iSlice, 5); | |
736 | ||
737 | if (fDebugLevel >= 2) | |
738 | { | |
739 | CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost)); | |
740 | if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d", *fSlaveTrackers[firstSlice + iSlice].NTracklets()); | |
741 | if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS) | |
742 | { | |
743 | HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant"); | |
744 | return(1); | |
745 | } | |
746 | } | |
747 | ||
748 | if (fDebugLevel >= 4) | |
749 | { | |
750 | *fOutFile << "Temporary "; | |
751 | CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost)); | |
752 | fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile); | |
753 | uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows()); | |
754 | CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost)); | |
755 | *fOutFile << "Start Hits Sort Vector:" << std::endl; | |
756 | for (int i = 0;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows();i++) | |
757 | { | |
758 | *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl; | |
759 | } | |
760 | free(tmpMemory); | |
761 | ||
762 | CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost)); | |
763 | fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile); | |
764 | } | |
765 | ||
766 | StandalonePerfTime(firstSlice + iSlice, 6); | |
767 | ||
768 | fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters()); | |
769 | } | |
770 | ||
771 | StandalonePerfTime(firstSlice, 7); | |
772 | #ifdef HLTCA_GPU_PREFETCHDATA | |
773 | for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) | |
774 | { | |
775 | if (fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v) > ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4)) | |
776 | { | |
777 | HLTError("Insufficiant GPU shared Memory, required: %d, available %d", fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v), ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4)); | |
778 | return(1); | |
779 | } | |
780 | if (fDebugLevel >= 1) | |
781 | { | |
782 | static int infoShown = 0; | |
783 | if (!infoShown) | |
784 | { | |
785 | HLTInfo("GPU Shared Memory Cache Size: %d", 2 * fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v)); | |
786 | infoShown = 1; | |
787 | } | |
788 | } | |
789 | } | |
790 | #endif | |
791 | ||
3a4ab1ca | 792 | int nHardCollisions = 0; |
793 | ||
794 | RestartTrackletConstructor: | |
eb08490f | 795 | if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler"); |
796 | for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) | |
797 | { | |
798 | AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice); | |
799 | if (CUDASync("Tracklet Initializer")) return 1; | |
800 | DumpRowBlocks(fSlaveTrackers, iSlice); | |
801 | } | |
802 | ||
803 | if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor"); | |
804 | AliHLTTPCCATrackletConstructorNewGPU<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>(); | |
805 | if (CUDASync("Tracklet Constructor (new)")) return 1; | |
806 | ||
807 | StandalonePerfTime(firstSlice, 8); | |
808 | ||
809 | if (fDebugLevel >= 4) | |
810 | { | |
811 | for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) | |
812 | { | |
813 | DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false); | |
814 | CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost)); | |
815 | if (fDebugLevel >= 5) | |
816 | { | |
817 | HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets()); | |
818 | } | |
819 | CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost)); | |
820 | CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost)); | |
821 | fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile); | |
822 | } | |
823 | } | |
824 | ||
c7d6f78b | 825 | int runSlices = 0; |
826 | for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices) | |
eb08490f | 827 | { |
c7d6f78b | 828 | if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++; |
829 | if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices); | |
830 | AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice)); | |
eb08490f | 831 | } |
832 | if (CUDASync("Tracklet Selector")) return 1; | |
833 | StandalonePerfTime(firstSlice, 9); | |
834 | ||
835 | CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + 0].CommonMemory(), fGpuTracker[0].CommonMemory(), fGpuTracker[0].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[0])); | |
836 | for (int iSliceTmp = 0;iSliceTmp <= sliceCountLocal;iSliceTmp++) | |
837 | { | |
838 | if (iSliceTmp < sliceCountLocal) | |
839 | { | |
840 | int iSlice = iSliceTmp; | |
841 | if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host"); | |
842 | cudaStreamSynchronize(cudaStreams[iSlice]); | |
843 | CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[iSlice])); | |
844 | CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[iSlice])); | |
845 | if (iSlice + 1 < sliceCountLocal) | |
846 | CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[iSlice + 1])); | |
847 | } | |
848 | ||
849 | if (iSliceTmp) | |
850 | { | |
851 | int iSlice = iSliceTmp - 1; | |
852 | cudaStreamSynchronize(cudaStreams[iSlice]); | |
853 | ||
854 | if (fDebugLevel >= 4) | |
855 | { | |
856 | CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost)); | |
857 | fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile); | |
858 | fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile); | |
859 | } | |
860 | ||
861 | if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError) | |
862 | { | |
3a4ab1ca | 863 | if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION && nHardCollisions++ < 10) |
864 | { | |
865 | HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor"); | |
866 | for (int i = 0;i < sliceCountLocal;i++) | |
867 | { | |
868 | cudaThreadSynchronize(); | |
869 | CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost)); | |
870 | *fSlaveTrackers[firstSlice + i].NTracks() = 0; | |
871 | *fSlaveTrackers[firstSlice + i].NTrackHits() = 0; | |
872 | fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE; | |
873 | CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice)); | |
874 | PreInitRowBlocks<<<30, 256>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign()); | |
875 | } | |
876 | goto RestartTrackletConstructor; | |
877 | } | |
eb08490f | 878 | HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError); |
879 | return(1); | |
880 | } | |
881 | if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits()); | |
882 | ||
883 | fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]); | |
884 | #ifdef HLTCA_GPU_TIME_PROFILE | |
8566066c | 885 | AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a); |
eb08490f | 886 | #endif |
887 | fSlaveTrackers[firstSlice + iSlice].WriteOutput(); | |
888 | #ifdef HLTCA_GPU_TIME_PROFILE | |
8566066c | 889 | AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b); |
eb08490f | 890 | printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c); |
891 | #endif | |
892 | ||
893 | if (fDebugLevel >= 4) | |
894 | { | |
895 | delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory(); | |
896 | delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory(); | |
897 | } | |
898 | } | |
899 | } | |
900 | ||
901 | StandalonePerfTime(firstSlice, 10); | |
902 | ||
903 | if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished"); | |
904 | ||
905 | #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE | |
906 | char* stageAtSync = (char*) malloc(100000000); | |
907 | CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].fStageAtSync, 100 * 1000 * 1000, cudaMemcpyDeviceToHost)); | |
908 | cudaFree(fGpuTracker[0].fStageAtSync); | |
909 | ||
910 | FILE* fp = fopen("profile.txt", "w+"); | |
911 | FILE* fp2 = fopen("profile.bmp", "w+b"); | |
912 | int nEmptySync = 0, fEmpty; | |
913 | ||
914 | const int bmpheight = 1000; | |
915 | BITMAPFILEHEADER bmpFH; | |
916 | BITMAPINFOHEADER bmpIH; | |
917 | ZeroMemory(&bmpFH, sizeof(bmpFH)); | |
918 | ZeroMemory(&bmpIH, sizeof(bmpIH)); | |
919 | ||
920 | bmpFH.bfType = 19778; //"BM" | |
921 | bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1) * bmpheight ; | |
922 | bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH); | |
923 | ||
924 | bmpIH.biSize = sizeof(bmpIH); | |
925 | bmpIH.biWidth = HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1; | |
926 | bmpIH.biHeight = bmpheight; | |
927 | bmpIH.biPlanes = 1; | |
928 | bmpIH.biBitCount = 32; | |
929 | ||
930 | fwrite(&bmpFH, 1, sizeof(bmpFH), fp2); | |
931 | fwrite(&bmpIH, 1, sizeof(bmpIH), fp2); | |
932 | ||
933 | for (int i = 0;i < bmpheight * HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;i += HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT) | |
934 | { | |
935 | fEmpty = 1; | |
936 | for (int j = 0;j < HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;j++) | |
937 | { | |
938 | fprintf(fp, "%d\t", stageAtSync[i + j]); | |
939 | int color = 0; | |
940 | if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0); | |
941 | if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0); | |
942 | if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255); | |
943 | if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0); | |
944 | fwrite(&color, 1, sizeof(int), fp2); | |
945 | if (j > 0 && j % 32 == 0) | |
946 | { | |
947 | color = RGB(255, 255, 255); | |
948 | fwrite(&color, 1, 4, fp2); | |
949 | } | |
950 | if (stageAtSync[i + j]) fEmpty = 0; | |
951 | } | |
952 | fprintf(fp, "\n"); | |
953 | if (fEmpty) nEmptySync++; | |
954 | else nEmptySync = 0; | |
955 | //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break; | |
956 | } | |
957 | ||
958 | fclose(fp); | |
959 | fclose(fp2); | |
960 | free(stageAtSync); | |
961 | #endif | |
962 | ||
963 | return(0); | |
964 | } | |
965 | ||
int AliHLTTPCCAGPUTrackerNVCC::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
{
	//Initialize the slave (CPU-side) tracker for slice iSlice with the given slice parameters.
	//Returns 0 on success, 1 when the parameter row count does not match the compile-time
	//constant HLTCA_ROW_COUNT that the GPU code was built against.
	fSlaveTrackers[iSlice].Initialize(param);
	if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
	{
		//The test above is for inequality, so report a mismatch (the old message claimed "exceeds")
		HLTError("Error, Slice Tracker %d Row Count of %d does not match Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
		return(1);
	}
	return(0);
}
977 | ||
int AliHLTTPCCAGPUTrackerNVCC::ExitGPU()
{
	//Uninitialize CUDA and release all GPU and page-locked host resources.
	//Returns 0 on success, 1 if the CUDA runtime could not be shut down.
	cudaThreadSynchronize();
	if (fGPUMemory)
	{
		cudaFree(fGPUMemory);
		fGPUMemory = NULL;
	}
	if (fHostLockedMemory)
	{
		//One stream per slice was created at init time, but never fewer than 3
		for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
		{
			cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
		}
		free(fpCudaStreams);
		fpCudaStreams = NULL;		//Reset so a repeated ExitGPU call cannot double-free
		fGpuTracker = NULL;
		cudaFreeHost(fHostLockedMemory);
		fHostLockedMemory = NULL;	//Reset so a repeated ExitGPU call cannot free the host buffer twice
	}

	if (CudaFailedMsg(cudaThreadExit()))
	{
		HLTError("Could not uninitialize GPU");
		return(1);
	}
	HLTInfo("CUDA Uninitialized");
	fgGPUUsed = false;
	fCudaInitialized = 0;
	return(0);
}
1008 | ||
void AliHLTTPCCAGPUTrackerNVCC::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
{
	//Store the output control pointer and propagate it to every slave tracker
	fOutputControl = val;
	for (int iSlice = 0;iSlice < fgkNSlices;iSlice++)
	{
		fSlaveTrackers[iSlice].SetOutputControl(val);
	}
}
1018 | ||
int AliHLTTPCCAGPUTrackerNVCC::GetThread()
{
	//Return an integer id for the calling OS thread (thread handle on Windows, kernel tid on Linux)
#ifdef R__WIN32
	return(static_cast<int>(reinterpret_cast<size_t>(GetCurrentThread())));
#else
	return(static_cast<int>(syscall(SYS_gettid)));
#endif
}
1028 | ||
unsigned long long int* AliHLTTPCCAGPUTrackerNVCC::PerfTimer(int iSlice, unsigned int i)
{
	//Return a pointer to performance timer i of slice iSlice, or NULL when no slave trackers exist
	if (fSlaveTrackers == NULL) return(NULL);
	return(fSlaveTrackers[iSlice].PerfTimer(i));
}
1034 | ||
const AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerNVCC::OutputControl() const
{
	//Accessor for the output control structure currently in use
	return(fOutputControl);
}
1040 | ||
int AliHLTTPCCAGPUTrackerNVCC::GetSliceCount() const
{
	//Accessor for the maximum number of slices this tracker can process in one call
	return fSliceCount;
}
1046 | ||
AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()
{
	//Factory function: allocate a new NVCC-based GPU tracker instance
	return(new AliHLTTPCCAGPUTrackerNVCC);
}
void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr)
{
	//Matching deleter for instances obtained from AliHLTTPCCAGPUTrackerNVCCCreate
	delete ptr;
}
1055 |