1 // **************************************************************************
\r
2 // This file is property of and copyright by the ALICE HLT Project *
\r
3 // ALICE Experiment at CERN, All rights reserved. *
\r
5 // Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
\r
6 // Ivan Kisel <kisel@kip.uni-heidelberg.de> *
\r
7 // David Rohr <drohr@kip.uni-heidelberg.de> *
\r
8 // for The ALICE HLT Project. *
\r
10 // Permission to use, copy, modify and distribute this software and its *
\r
11 // documentation strictly for non-commercial purposes is hereby granted *
\r
12 // without fee, provided that the above copyright notice appears in all *
\r
13 // copies and that both the copyright notice and this permission notice *
\r
14 // appear in the supporting documentation. The authors make no claims *
\r
15 // about the suitability of this software for any purpose. It is *
\r
16 // provided "as is" without express or implied warranty. *
\r
18 //***************************************************************************
\r
20 #include "AliHLTTPCCAGPUTrackerNVCC.h"
\r
22 #ifdef HLTCA_GPUCODE
\r
24 #include <sm_11_atomic_functions.h>
\r
25 #include <sm_12_atomic_functions.h>
\r
30 #include <sys/syscall.h>
\r
31 #include <semaphore.h>
\r
35 #include "AliHLTTPCCADef.h"
\r
36 #include "AliHLTTPCCAGPUConfig.h"
\r
41 //Disable assertions since they produce errors in GPU Code
\r
45 #define assert(param)
\r
// Constant-memory image of the per-slice AliHLTTPCCATracker objects; filled on the
// host via cudaMemcpyToSymbolAsync before kernels run (see Reconstruct below).
47 __constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];

// Optional texture references used for cached read-only fetches of slice data when
// HLTCA_GPU_TEXTURE_FETCH is enabled; bound in Reconstruct via cudaBindTexture.
48 #ifdef HLTCA_GPU_TEXTURE_FETCH

49 texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;

50 texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;

51 texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
\r
54 //Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code
\r
55 #include "AliHLTTPCCATrackParam.cxx"
\r
56 #include "AliHLTTPCCATrack.cxx"
\r
58 #include "AliHLTTPCCAHitArea.cxx"
\r
59 #include "AliHLTTPCCAGrid.cxx"
\r
60 #include "AliHLTTPCCARow.cxx"
\r
61 #include "AliHLTTPCCAParam.cxx"
\r
62 #include "AliHLTTPCCATracker.cxx"
\r
64 #include "AliHLTTPCCAProcess.h"
\r
66 #include "AliHLTTPCCATrackletSelector.cxx"
\r
67 #include "AliHLTTPCCANeighboursFinder.cxx"
\r
68 #include "AliHLTTPCCANeighboursCleaner.cxx"
\r
69 #include "AliHLTTPCCAStartHitsFinder.cxx"
\r
70 #include "AliHLTTPCCAStartHitsSorter.cxx"
\r
71 #include "AliHLTTPCCATrackletConstructor.cxx"
\r
73 #include "MemoryAssignmentHelpers.h"
\r
75 #ifndef HLTCA_STANDALONE
\r
76 #include "AliHLTDefinitions.h"
\r
77 #include "AliHLTSystem.h"
\r
80 ClassImp( AliHLTTPCCAGPUTrackerNVCC )

// Process-wide flag: set once a tracker instance has claimed the GPU, checked in
// InitGPU to reject a second tracker in the same process.
82 bool AliHLTTPCCAGPUTrackerNVCC::fgGPUUsed = false;

// Name of the global named semaphore serializing GPU initialization across processes.
84 #define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem"

// Default constructor.
// NOTE(review): the initializer list is truncated in this view (extraction dropped
// lines) — only a subset of the member initializers is visible below; confirm the
// full list against the repository original before editing.
86 AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() :

89 fHostLockedMemory(NULL),

91 fDebugMask(0xFFFFFFFF),

94 fpCudaStreams(NULL),

96 fOutputControl(NULL),

98 fCudaInitialized(0),

// Destructor is trivial; GPU resources are presumably released elsewhere
// (e.g. an ExitGPU method not visible in this chunk) — TODO confirm.
102 AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC() {};
\r
// Release the process-global named semaphore acquired during InitGPU.
// sem: opaque pointer to the platform lock object (HANDLE* on Windows, sem_t* on POSIX).
// NOTE(review): the platform #ifdef/#else/#endif lines separating the two branches
// were dropped by the extraction — the branches below are NOT both executed.
104 void AliHLTTPCCAGPUTrackerNVCC::ReleaseGlobalLock(void* sem)

106 //Release the global named semaphore that locks GPU Initialization

// Windows branch: release one permit on the semaphore handle.
108 HANDLE* h = (HANDLE*) sem;

109 ReleaseSemaphore(*h, 1, NULL);

// POSIX branch: unlink the named semaphore so it is removed once closed.
// NOTE(review): the matching sem_post/sem_close calls are not visible here —
// presumably in elided lines; verify against the original file.
113 sem_t* pSem = (sem_t*) sem;

115 sem_unlink(SemLockName);
\r
// Validate at runtime that the compile-time memory-size constants are large enough
// for the requested number of slices. Returns nonzero on failure (the return
// statements themselves fall on lines elided from this view).
// sliceCount: number of TPC slices the tracker will be initialized for.
119 int AliHLTTPCCAGPUTrackerNVCC::CheckMemorySizes(int sliceCount)

121 //Check constants for correct memory sizes

// One AliHLTTPCCATracker object per slice must fit in the tracker-object region.
122 if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)

124 HLTError("Insufficiant Tracker Object Memory");

// Common (shared bookkeeping) memory for all slices must fit its region.
128 if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)

130 HLTError("Insufficiant Common Memory");

// Row descriptors: HLTCA_ROW_COUNT rows plus one sentinel per slice.
134 if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)

136 HLTError("Insufficiant Row Memory");

140 if (fDebugLevel >= 3)

142 HLTInfo("Memory usage: Tracker Object %d / %d, Common Memory %d / %d, Row Memory %d / %d", sizeof(AliHLTTPCCATracker) * sliceCount, HLTCA_GPU_TRACKER_OBJECT_MEMORY, fgkNSlices * AliHLTTPCCATracker::CommonMemorySize(), HLTCA_GPU_COMMON_MEMORY, fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), HLTCA_GPU_ROWS_MEMORY);
\r
// Select the best available CUDA device, set up device + page-locked host memory,
// create CUDA streams, and optionally run one warm-up reconstruction.
// sliceCount:    number of slices to size all allocations for.
// forceDeviceID: explicit CUDA device index, or -1 to auto-select the fastest.
// Returns 0 on success, nonzero on failure (many return statements fall on lines
// elided from this view).
// NOTE(review): platform #ifdef (Windows/POSIX) and several brace/return lines were
// dropped by the extraction throughout this function.
147 int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)

149 //Find best CUDA device, initialize and allocate memory

151 if (CheckMemorySizes(sliceCount)) return(1);

// --- Acquire the process-global init lock (Windows branch) ---
154 HANDLE* semLock = new HANDLE;

155 *semLock = CreateSemaphore(NULL, 1, 1, SemLockName);

156 if (*semLock == NULL)

158 HLTError("Error creating GPUInit Semaphore");

161 WaitForSingleObject(*semLock, INFINITE);

// --- POSIX branch: create/open the named semaphore (mode 0666) ---
163 sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1);

164 if (semLock == SEM_FAILED)

166 HLTError("Error creating GPUInit Semaphore");

// Only one GPU tracker per process is allowed (fgGPUUsed guard; check elided here).
174 HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in same process");

175 ReleaseGlobalLock(semLock);

// Remember which thread initialized the context; Reconstruct enforces same-thread use.
179 fThreadId = GetThread();

181 cudaDeviceProp fCudaDeviceProp;

// Total device memory needed: rows + common + per-slice (slice data + global scratch).
183 fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);

185 #ifndef CUDA_DEVICE_EMULATION

186 int count, bestDevice = -1;

187 long long int bestDeviceSpeed = 0, deviceSpeed;

188 if (CudaFailedMsg(cudaGetDeviceCount(&count)))

190 HLTError("Error getting CUDA Device Count");

192 ReleaseGlobalLock(semLock);

// --- Enumerate devices and score each candidate ---
195 if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");

196 for (int i = 0;i < count;i++)

// NOTE(review): cuMemGetInfo with unsigned int arguments is the legacy 32-bit
// driver-API signature — free/total wrap above 4 GB; confirm against the CUDA
// version this is built for.
198 unsigned int free, total;

// A temporary driver-API context is created per device solely to query free memory.
200 CUdevice tmpDevice;

201 cuDeviceGet(&tmpDevice, i);

202 CUcontext tmpContext;

203 cuCtxCreate(&tmpContext, 0, tmpDevice);

204 if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";

205 cuCtxDestroy(tmpContext);

206 if (CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i))) continue;

// Eligibility: enough multiprocessors for the slice count, compute capability in
// [1.2, 9.0), and enough free memory for fGPUMemSize.
208 int deviceOK = sliceCount <= fCudaDeviceProp.multiProcessorCount && fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && free >= fGPUMemSize;

210 if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %d / %lld)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, free, (long long int) fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");

// Heuristic speed score: SMs * clock * warp size * free memory.
211 deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free;

212 if (deviceOK && deviceSpeed > bestDeviceSpeed)

215 bestDeviceSpeed = deviceSpeed;

218 if (bestDevice == -1)

220 HLTWarning("No CUDA Device available, aborting CUDA Initialisation");

221 HLTInfo("Requiring Revision 1.3, Mem: %d", fGPUMemSize);

223 ReleaseGlobalLock(semLock);

// Honor an explicit device request; otherwise take the highest-scoring device.
228 if (forceDeviceID == -1)

229 cudaDevice = bestDevice;

231 cudaDevice = forceDeviceID;

// Emulation build: device 0 unconditionally.
233 int cudaDevice = 0;

236 cudaGetDeviceProperties(&fCudaDeviceProp ,cudaDevice );

// --- Dump the chosen device's properties at debug level >= 1 ---
238 if (fDebugLevel >= 1)

240 HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);

241 HLTInfo("totalGlobalMem = %lld", (unsigned long long int) fCudaDeviceProp.totalGlobalMem);

242 HLTInfo("sharedMemPerBlock = %lld", (unsigned long long int) fCudaDeviceProp.sharedMemPerBlock);

243 HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);

244 HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);

245 HLTInfo("memPitch = %lld", (unsigned long long int) fCudaDeviceProp.memPitch);

246 HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);

247 HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);

248 HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);

249 HLTInfo("totalConstMem = %lld", (unsigned long long int) fCudaDeviceProp.totalConstMem);

250 HLTInfo("major = %d", fCudaDeviceProp.major);

251 HLTInfo("minor = %d", fCudaDeviceProp.minor);

252 HLTInfo("clockRate %d= ", fCudaDeviceProp.clockRate);

253 HLTInfo("textureAlignment %lld= ", (unsigned long long int) fCudaDeviceProp.textureAlignment);

// Reject devices below compute capability 1.2 (needed for the sm_12 atomics above).
256 if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))

258 HLTError( "Unsupported CUDA Device" );

260 ReleaseGlobalLock(semLock);

264 if (CudaFailedMsg(cudaSetDevice(cudaDevice)))

266 HLTError("Could not set CUDA Device!");

268 ReleaseGlobalLock(semLock);

// --- Allocate the single large device buffer ---
272 if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))

274 HLTError("CUDA Memory Allocation Error");

277 ReleaseGlobalLock(semLock);

// Device memory is allocated; initialization can proceed unlocked from here on.
280 ReleaseGlobalLock(semLock);

281 if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);

// Page-locked host buffer mirrors the device layout, plus track output and the
// tracker-object region (needed for async copies to/from the device).
282 int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;

283 if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))

285 cudaFree(fGPUMemory);

287 HLTError("Error allocating Page Locked Host Memory");

291 if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);

// Debug builds poison device memory with a known pattern (143 = 0x8F per byte).
293 if (fDebugLevel >= 1)

295 CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));

298 fSliceCount = sliceCount;

299 //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory

300 fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);

// Configure every slave tracker to use the page-locked host regions.
302 for (int i = 0;i < fgkNSlices;i++)

304 fSlaveTrackers[i].SetGPUTracker();

305 fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));

306 fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));

// --- Create at least 3 CUDA streams (copy/compute overlap in Reconstruct) ---
309 fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));

310 cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;

311 for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)

313 if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))

315 cudaFree(fGPUMemory);

316 cudaFreeHost(fHostLockedMemory);

318 HLTError("Error creating CUDA Stream");

324 fCudaInitialized = 1;

325 HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %d, Max slices: %d)", cudaDevice, fCudaDeviceProp.name, fThreadId, fSliceCount);

// --- Standalone builds: run one throwaway event so later benchmarks exclude
//     one-time CUDA warm-up cost ---
327 #if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)

328 if (fDebugLevel < 2)

330 //Do one initial run for Benchmark reasons

331 const int useDebugLevel = fDebugLevel;

333 AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];

337 AliHLTTPCCAParam tmpParam;

338 AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;

// Read the tracker settings dump shipped with the standalone test data.
340 fin.open("events/settings.dump");

343 for (int i = 0;i < sliceCount;i++)

345 fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);

346 tmpParam.ReadSettings(fin);

347 InitializeSliceParam(i, tmpParam);

// Load one recorded Pb-Pb event as the warm-up input.
351 fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);

352 for (int i = 0;i < sliceCount;i++)

354 tmpCluster[i].StartReading(i, 0);

355 tmpCluster[i].ReadEvent(fin);

356 tmpCluster[i].FinishReading();

360 AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];

361 memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));

363 Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);

// Discard the warm-up output and restore tracker state.
364 for (int i = 0;i < sliceCount;i++)

366 free(tmpOutput[i]);

367 tmpOutput[i] = NULL;

368 fSlaveTrackers[i].SetOutputControl(NULL);

370 delete[] tmpOutput;

371 delete[] tmpCluster;

372 fDebugLevel = useDebugLevel;
\r
// Round a pointer up to the next multiple of 'alignment' bytes.
// ptr:       pointer to align (any type).
// alignment: alignment in bytes (callers in this file pass 1024*1024).
// Returns the aligned pointer (the return statement falls on a line elided from
// this view — presumably 'return((T*) adr);').
378 template <class T> inline T* AliHLTTPCCAGPUTrackerNVCC::alignPointer(T* ptr, int alignment)

380 //Macro to align Pointers.

381 //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker

382 //(As long as every single data structure is <= 1 MB)

384 size_t adr = (size_t) ptr;

// Only bump the address when it is not already aligned.
385 if (adr % alignment)

387 adr += alignment - (adr % alignment);
\r
// Check a CUDA runtime call's result; log a warning with the error string on
// failure. Returns false on cudaSuccess; the 'return(true)' for the failure path
// falls on a line elided from this view.
392 bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsg(cudaError_t error)

394 //Check for CUDA Error and in the case of an error display the corresponding error string

395 if (error == cudaSuccess) return(false);

396 HLTWarning("CUDA Error: %d / %s", error, cudaGetErrorString(error));
\r
// Synchronize with the device after a kernel launch and report any pending or
// execution-time CUDA error. Skipped entirely at debug level 0 (kernels then run
// fully asynchronously). Returns 0 on success; error-path returns are on lines
// elided from this view.
// state:      human-readable tag of the kernel just launched (for log messages).
// sliceLocal: slice index within this reconstruction call.
// slice:      global slice index.
400 int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state, int sliceLocal, int slice)

402 //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards

404 if (fDebugLevel == 0) return(0);

// First pick up launch-time errors (bad configuration etc.).
// NOTE(review): the declaration of 'cuErr' (cudaError_t) is on an elided line.
406 cuErr = cudaGetLastError();

407 if (cuErr != cudaSuccess)

409 HLTError("Cuda Error %s while running kernel (%s) (Slice %d; %d/%d)", cudaGetErrorString(cuErr), state, sliceLocal, slice, fgkNSlices);

// Then block until the kernel finishes, surfacing execution-time errors.
412 if (CudaFailedMsg(cudaThreadSynchronize()))

414 HLTError("CUDA Error while synchronizing (%s) (Slice %d; %d/%d)", state, sliceLocal, slice, fgkNSlices);

417 if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
\r
// Set the verbosity level for this tracker and optionally redirect debug dumps.
// dwLevel:    new debug level (0 = silent/fully async, higher = more checks/dumps).
// NewOutFile: optional stream for debug dump output; ignored when NULL so the
//             current fOutFile is kept.
421 void AliHLTTPCCAGPUTrackerNVCC::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)

423 //Set Debug Level and Debug output File if applicable

424 fDebugLevel = dwLevel;

425 if (NewOutFile) fOutFile = NewOutFile;
\r
// Set a named tracker option. Recognized options: "PPMode" (proton-proton fast
// path toggle used by Reconstruct) and "DebugMask" (bitmask selecting which debug
// dumps are written). Returns on lines elided from this view (presumably 0 on
// success, nonzero for an unknown option).
428 int AliHLTTPCCAGPUTrackerNVCC::SetGPUTrackerOption(char* OptionName, int OptionValue)

430 //Set a specific GPU Tracker Option

431 if (strcmp(OptionName, "PPMode") == 0)

433 fPPMode = OptionValue;

435 else if (strcmp(OptionName, "DebugMask") == 0)

437 fDebugMask = OptionValue;

441 HLTError("Unknown Option: %s", OptionName);
\r
// Record a performance timestamp for timer 'i' of slice 'iSlice'.
// Standalone builds query a real timer at debug level >= 1; HLT builds compile to
// a no-op stub. NOTE(review): the #else/#endif lines separating the two variants
// were dropped by the extraction.
447 #ifdef HLTCA_STANDALONE

448 void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int iSlice, int i)

450 //Run Performance Query for timer i of slice iSlice

451 if (fDebugLevel >= 1)

453 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));

// Non-standalone stub: timing disabled.
457 void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
\r
// Debug helper: copy the row-block scheduling structures of one slice back from
// the GPU and print them to *fOutFile. Active only at debug level >= 4.
// tracker: base of the host-side slave tracker array (indexed by iSlice).
// iSlice:  slice index within this reconstruction call.
// check:   when true, verify that phase-1 entries form the expected sequence of
//          statically scheduled tracklet IDs.
460 void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)

462 //Dump Rowblocks to File

463 if (fDebugLevel >= 4)

465 *fOutFile << "RowBlock Tracklets (Slice " << tracker[iSlice].Param().ISlice() << " (" << iSlice << " of reco))";

466 *fOutFile << " after Tracklet Reconstruction";

467 *fOutFile << std::endl;

// Host-side staging buffers for the device structures being dumped.
469 int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);

470 int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);

471 uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT);

472 CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));

473 CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));

474 CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT, cudaMemcpyDeviceToHost));

475 CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));

// Expected value for the static-scheduling check; incremented as entries match.
477 int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;

478 for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)

// First half of rowBlockPos is the "up" direction, second half is "down".
480 *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " <<

481 rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";

482 for (int j = 0;j < rowBlockPos[i].x;j++)

484 //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!

485 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";

486 #ifdef HLTCA_GPU_SCHED_FIXED_START

// With fixed-start scheduling, phase-1 entries must be consecutive tracklet IDs.
487 if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)

489 HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);

491 #endif //HLTCA_GPU_SCHED_FIXED_START

// -1 marks an unfilled slot and must never appear among scheduled entries.
493 if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)

495 HLTError("Error, -1 Tracklet found");

498 *fOutFile << std::endl << "Phase 2: ";

499 for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)

501 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";

503 *fOutFile << std::endl;

508 *fOutFile << "Starting Threads: (First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;

509 for (int i = 0;i < HLTCA_GPU_BLOCK_COUNT;i++)

511 *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;

// NOTE(review): free(rowBlockPos) is not visible here — presumably on an elided
// line directly above; otherwise this leaks. Verify against the original file.
516 free(rowBlockTracklets);

517 free(blockStartingTracklet);
\r
// Kernel: reset the per-slice scheduling/weight buffers before tracklet
// construction. Launched as PreInitRowBlocks<<<30, 256>>> (see Reconstruct);
// grid-stride loops make correctness independent of the launch configuration.
// RowBlockPos:         zeroed (fill counts reset).
// RowBlockTracklets:   filled with -1 (empty-slot marker checked in DumpRowBlocks).
// SliceDataHitWeights: zeroed; writes go through int4 lanes for wide stores.
// nSliceDataHits:      number of hit-weight ints to clear.
521 __global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)

523 //Initialize GPU RowBlocks and HitWeights

524 int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;

525 int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;

526 const int stride = blockDim.x * gridDim.x;

// Fill patterns: i0 = all zeros, i1 = all -1.
// NOTE(review): the declarations of i0/i1 (int4) are on a line elided from this view.
528 i0.x = i0.y = i0.z = i0.w = 0;

529 i1.x = i1.y = i1.z = i1.w = -1;

530 for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)

531 RowBlockPos[i] = i0;

532 for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)

533 rowBlockTracklets4[i] = i1;

534 for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)

535 sliceDataHitWeights4[i] = i0;
\r
538 int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
\r
540 //Primary reconstruction function
\r
542 cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
\r
544 if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;
\r
546 if (!fCudaInitialized)
\r
548 HLTError("GPUTracker not initialized");
\r
551 if (sliceCountLocal > fSliceCount)
\r
553 HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);
\r
556 if (fThreadId != GetThread())
\r
558 HLTError("GPUTracker context was initialized by different thread, Initializing Thread: %d, Processing Thread: %d", fThreadId, GetThread());
\r
562 if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);
\r
564 if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
\r
566 HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));
\r
570 if (fPPMode) return(ReconstructPP(pOutput, pClusterData, firstSlice, sliceCountLocal));
\r
572 memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
\r
574 if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
\r
576 #ifdef HLTCA_GPU_TIME_PROFILE
\r
577 unsigned long long int a, b, c, d;
\r
578 AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);
\r
579 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);
\r
582 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
\r
584 //Make this a GPU Tracker
\r
585 fGpuTracker[iSlice].SetGPUTracker();
\r
586 fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
\r
587 fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
\r
588 fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
\r
590 //Set Pointers to GPU Memory
\r
591 char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
\r
593 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
\r
594 tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
\r
595 tmpMem = alignPointer(tmpMem, 1024 * 1024);
\r
597 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
\r
598 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);
\r
599 tmpMem = alignPointer(tmpMem, 1024 * 1024);
\r
601 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
\r
602 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());
\r
603 tmpMem = alignPointer(tmpMem, 1024 * 1024);
\r
605 if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
\r
607 HLTError("Insufficiant Track Memory");
\r
611 if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
\r
613 HLTError("Insufficiant Global Memory");
\r
617 if (fDebugLevel >= 3)
\r
619 HLTInfo("GPU Global Memory Used: %d/%d, Page Locked Tracks Memory used: %d / %d", tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice), HLTCA_GPU_GLOBAL_MEMORY, fGpuTracker[iSlice].TrackMemorySize(), HLTCA_GPU_TRACKS_MEMORY);
\r
622 //Initialize Startup Constants
\r
623 *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
\r
624 *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
\r
625 *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
\r
626 fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = HLTCA_GPU_BLOCK_COUNT * (iSlice + 1) / sliceCountLocal - HLTCA_GPU_BLOCK_COUNT * (iSlice) / sliceCountLocal;
\r
627 if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);
\r
628 fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
\r
629 fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
\r
630 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
\r
631 fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
\r
634 #ifdef HLTCA_GPU_TEXTURE_FETCH
\r
635 cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
\r
637 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
\r
639 HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
\r
642 cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
\r
643 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
\r
645 HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
\r
648 cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
\r
649 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
\r
651 HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
\r
656 //Copy Tracker Object to GPU Memory
\r
657 if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
\r
658 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
\r
660 if (CudaFailedMsg(cudaMalloc(&tmpMem, 100000000))) return(1);
\r
661 fGpuTracker[0].fStageAtSync = tmpMem;
\r
662 CudaFailedMsg(cudaMemset(fGpuTracker[0].StageAtSync(), 0, 100000000));
\r
664 CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));
\r
666 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
\r
668 StandalonePerfTime(firstSlice + iSlice, 0);
\r
670 //Initialize GPU Slave Tracker
\r
671 if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
\r
672 fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
\r
673 #ifdef HLTCA_GPU_TIME_PROFILE
\r
674 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
\r
676 fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
\r
678 if (fDebugLevel >= 4)
\r
680 *fOutFile << std::endl << std::endl << "Reconstruction: " << iSlice << "/" << sliceCountLocal << " Total Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << " / " << fgkNSlices << std::endl;
\r
682 if (fDebugMask & 1) fSlaveTrackers[firstSlice + iSlice].DumpSliceData(*fOutFile);
\r
685 #ifdef HLTCA_GPU_TIME_PROFILE
\r
686 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
\r
687 printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
\r
689 if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
\r
691 HLTError("Insufficiant Slice Data Memory");
\r
695 if (fDebugLevel >= 3)
\r
697 HLTInfo("GPU Slice Data Memory Used: %d/%d", fSlaveTrackers[firstSlice + iSlice].Data().MemorySize(), HLTCA_GPU_SLICE_DATA_MEMORY);
\r
700 //Initialize temporary memory where needed
\r
701 if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");
\r
702 PreInitRowBlocks<<<30, 256, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());
\r
704 //Copy Data to GPU Global Memory
\r
705 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
\r
706 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
\r
707 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
\r
709 if (fDebugLevel >= 4)
\r
711 if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
\r
712 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS);
\r
713 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() );
\r
716 if (CUDASync("Initialization", iSlice, iSlice + firstSlice)) return(1);
\r
717 StandalonePerfTime(firstSlice + iSlice, 1);
\r
719 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder (Slice %d/%d)", iSlice, sliceCountLocal);
\r
720 AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
\r
722 if (CUDASync("Neighbours finder", iSlice, iSlice + firstSlice)) return 1;
\r
724 StandalonePerfTime(firstSlice + iSlice, 2);
\r
726 if (fDebugLevel >= 4)
\r
728 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
\r
729 if (fDebugMask & 2) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
\r
732 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner (Slice %d/%d)", iSlice, sliceCountLocal);
\r
733 AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
\r
734 if (CUDASync("Neighbours Cleaner", iSlice, iSlice + firstSlice)) return 1;
\r
736 StandalonePerfTime(firstSlice + iSlice, 3);
\r
738 if (fDebugLevel >= 4)
\r
740 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
\r
741 if (fDebugMask & 4) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
\r
744 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder (Slice %d/%d)", iSlice, sliceCountLocal);
\r
745 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
\r
746 if (CUDASync("Start Hits Finder", iSlice, iSlice + firstSlice)) return 1;
\r
748 StandalonePerfTime(firstSlice + iSlice, 4);
\r
750 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter (Slice %d/%d)", iSlice, sliceCountLocal);
\r
751 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<30, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
\r
752 if (CUDASync("Start Hits Sorter", iSlice, iSlice + firstSlice)) return 1;
\r
754 StandalonePerfTime(firstSlice + iSlice, 5);
\r
756 if (fDebugLevel >= 2)
\r
758 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
\r
759 if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d (Slice %d)", *fSlaveTrackers[firstSlice + iSlice].NTracklets(), iSlice);
\r
760 if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS)
\r
762 HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant");
\r
767 if (fDebugLevel >= 4)
\r
769 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
\r
770 if (fDebugMask & 8)
\r
772 *fOutFile << "Temporary ";
\r
773 fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
\r
775 uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
\r
776 CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
\r
777 if (fDebugMask & 16)
\r
779 *fOutFile << "Start Hits Sort Vector:" << std::endl;
\r
780 for (int i = 0;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows();i++)
\r
782 *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;
\r
787 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
\r
788 if (fDebugMask & 32) fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
\r
791 StandalonePerfTime(firstSlice + iSlice, 6);
\r
793 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
\r
796 StandalonePerfTime(firstSlice, 7);
\r
797 #ifdef HLTCA_GPU_PREFETCHDATA
\r
798 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
\r
800 if (fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v) > ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4))
\r
802 HLTError("Insufficiant GPU shared Memory, required: %d, available %d", fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v), ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4));
\r
805 if (fDebugLevel >= 1)
\r
807 static int infoShown = 0;
\r
810 HLTInfo("GPU Shared Memory Cache Size: %d", 2 * fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v));
\r
817 int nHardCollisions = 0;
\r
819 RestartTrackletConstructor:
\r
820 if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
\r
821 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
\r
823 AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);
\r
824 if (CUDASync("Tracklet Initializer", iSlice, iSlice + firstSlice)) return 1;
\r
825 if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice);
\r
828 if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
\r
829 AliHLTTPCCATrackletConstructorGPU<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>();
\r
830 if (CUDASync("Tracklet Constructor", 0, firstSlice)) return 1;
\r
832 StandalonePerfTime(firstSlice, 8);
\r
834 if (fDebugLevel >= 4)
\r
836 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
\r
838 if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);
\r
839 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
\r
840 if (fDebugLevel >= 5)
\r
842 HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
\r
844 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
\r
845 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
\r
846 if (fDebugMask & 128) fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);
\r
851 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)
\r
853 if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;
\r
854 if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);
\r
855 AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));
\r
856 if (CUDASync("Tracklet Selector", iSlice, iSlice + firstSlice)) return 1;
\r
858 StandalonePerfTime(firstSlice, 9);
\r
860 int tmpSlice = 0, tmpSlice2 = 0;
\r
861 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
\r
863 if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");
\r
865 while(tmpSlice < sliceCountLocal && (tmpSlice == iSlice || cudaStreamQuery(cudaStreams[tmpSlice]) == CUDA_SUCCESS))
\r
867 if (CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice]))) return(1);
\r
871 while (tmpSlice2 < tmpSlice && (tmpSlice2 == iSlice ? cudaStreamSynchronize(cudaStreams[tmpSlice2]) : cudaStreamQuery(cudaStreams[tmpSlice2])) == CUDA_SUCCESS)
\r
873 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].Tracks(), fGpuTracker[tmpSlice2].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + tmpSlice2].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));
\r
874 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].TrackHits(), fGpuTracker[tmpSlice2].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + tmpSlice2].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));
\r
878 cudaStreamSynchronize(cudaStreams[iSlice]);
\r
880 if (fDebugLevel >= 4)
\r
882 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
\r
883 if (fDebugMask & 256) fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
\r
884 if (fDebugMask & 512) fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);
\r
887 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
\r
889 if ((fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION || fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_WRONG_ROW)&& nHardCollisions++ < 10)
\r
891 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION)
\r
893 HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor");
\r
897 HLTWarning("Tracklet Constructor returned invalid row");
\r
899 for (int i = 0;i < sliceCountLocal;i++)
\r
901 cudaThreadSynchronize();
\r
902 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));
\r
903 *fSlaveTrackers[firstSlice + i].NTracks() = 0;
\r
904 *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;
\r
905 fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;
\r
906 CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));
\r
907 PreInitRowBlocks<<<30, 256>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());
\r
909 goto RestartTrackletConstructor;
\r
911 HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
\r
914 if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
\r
916 fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
\r
917 #ifdef HLTCA_GPU_TIME_PROFILE
\r
918 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
\r
920 fSlaveTrackers[firstSlice + iSlice].WriteOutput();
\r
921 #ifdef HLTCA_GPU_TIME_PROFILE
\r
922 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
\r
923 printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
\r
926 if (fDebugLevel >= 4)
\r
928 delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();
\r
929 delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();
\r
933 StandalonePerfTime(firstSlice, 10);
\r
935 if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
\r
937 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
\r
938 char* stageAtSync = (char*) malloc(100000000);
\r
939 CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].StageAtSync(), 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
\r
940 cudaFree(fGpuTracker[0].StageAtSync());
\r
942 FILE* fp = fopen("profile.txt", "w+");
\r
943 FILE* fp2 = fopen("profile.bmp", "w+b");
\r
944 int nEmptySync = 0, fEmpty;
\r
946 const int bmpheight = 1000;
\r
947 BITMAPFILEHEADER bmpFH;
\r
948 BITMAPINFOHEADER bmpIH;
\r
949 ZeroMemory(&bmpFH, sizeof(bmpFH));
\r
950 ZeroMemory(&bmpIH, sizeof(bmpIH));
\r
952 bmpFH.bfType = 19778; //"BM"
\r
953 bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1) * bmpheight ;
\r
954 bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);
\r
956 bmpIH.biSize = sizeof(bmpIH);
\r
957 bmpIH.biWidth = HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1;
\r
958 bmpIH.biHeight = bmpheight;
\r
959 bmpIH.biPlanes = 1;
\r
960 bmpIH.biBitCount = 32;
\r
962 fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
\r
963 fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);
\r
965 for (int i = 0;i < bmpheight * HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;i += HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT)
\r
968 for (int j = 0;j < HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;j++)
\r
970 fprintf(fp, "%d\t", stageAtSync[i + j]);
\r
972 if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);
\r
973 if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);
\r
974 if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);
\r
975 if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);
\r
976 fwrite(&color, 1, sizeof(int), fp2);
\r
977 if (j > 0 && j % 32 == 0)
\r
979 color = RGB(255, 255, 255);
\r
980 fwrite(&color, 1, 4, fp2);
\r
982 if (stageAtSync[i + j]) fEmpty = 0;
\r
985 if (fEmpty) nEmptySync++;
\r
986 else nEmptySync = 0;
\r
987 //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;
\r
__global__ void ClearPPHitWeights(int sliceCount)
{
	// Zero the per-hit weight array of every slice before tracklet construction.
	// Stores are done int4-wide (16 bytes) in a grid-stride loop so any launch
	// configuration covers the whole array.
	const int firstIndex = blockIdx.x * blockDim.x + threadIdx.x;
	const int gridStride = blockDim.x * gridDim.x;
	int4 zero;
	zero.x = zero.y = zero.z = zero.w = 0;

	for (int k = 0;k < sliceCount;k++)
	{
		// Tracker objects live in constant memory (gAliHLTTPCCATracker), one per slice.
		AliHLTTPCCATracker &tracker = ((AliHLTTPCCATracker*) gAliHLTTPCCATracker)[k];
		int4* const pHitWeights = (int4*) tracker.Data().HitWeights();
		// NumberOfHitsPlusAlign() counts ints; four consecutive ints form one int4 store.
		const unsigned int vecCount = tracker.Data().NumberOfHitsPlusAlign() * sizeof(int) / sizeof(int4);
		for (unsigned int i = firstIndex;i < vecCount;i += gridStride)
		{
			pHitWeights[i] = zero;
		}
	}
}
1018 int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
1020 //Primary reconstruction function for small events (PP)
// NOTE(review): brace/blank lines (and the matching #endif lines of the #ifdef
// blocks) are missing from this view of the file; the comments below annotate the
// visible statements without restoring the lost structure.
// Phase 0: mirror the host-side trackers into fGpuTracker; these copies are then
// re-pointed at GPU memory while fSlaveTrackers keeps the host view of the same slices.
1022 memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
1024 if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
1026 #ifdef HLTCA_GPU_TIME_PROFILE
// a/b bracket individual operations, c is the query frequency, d the start stamp.
1027 unsigned long long int a, b, c, d;
1028 AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);
1029 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);
// Slice data is packed back-to-back in one pinned host buffer and one GPU buffer;
// both cursors advance per slice and are re-aligned to 64 KB below.
1032 char* tmpSliceMemHost = (char*) SliceDataMemory(fHostLockedMemory, 0);
1033 char* tmpSliceMemGpu = (char*) SliceDataMemory(fGPUMemory, 0);
// Phase 1: per-slice setup — read event data on the host, carve out GPU memory regions.
1035 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1037 StandalonePerfTime(firstSlice + iSlice, 0);
1039 //Initialize GPU Slave Tracker
1040 if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
1041 fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(tmpSliceMemHost, RowMemory(fHostLockedMemory, firstSlice + iSlice));
1042 #ifdef HLTCA_GPU_TIME_PROFILE
1043 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
1045 fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
1046 #ifdef HLTCA_GPU_TIME_PROFILE
1047 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
1048 printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
// Guard against the event outgrowing the fixed per-slice buffer; presumably returns
// an error here in the full source — the return line is not visible. TODO confirm.
1050 if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
1052 HLTError("Insufficiant Slice Data Memory");
1056 //Make this a GPU Tracker
1057 fGpuTracker[iSlice].SetGPUTracker();
1058 fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
1061 fGpuTracker[iSlice].SetGPUSliceDataMemory(tmpSliceMemGpu, RowMemory(fGPUMemory, iSlice));
1062 fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
// Advance both packed-buffer cursors by the size just consumed, 64 KB aligned.
1064 tmpSliceMemHost += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();
1065 tmpSliceMemHost = alignPointer(tmpSliceMemHost, 64 * 1024);
1066 tmpSliceMemGpu += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();
1067 tmpSliceMemGpu = alignPointer(tmpSliceMemGpu, 64 * 1024);
// Sub-allocate the slice's global-memory arena: hits, then tracklets, then tracks.
1070 //Set Pointers to GPU Memory
1071 char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
1073 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
1074 tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
1075 tmpMem = alignPointer(tmpMem, 64 * 1024);
1077 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
1078 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);
1079 tmpMem = alignPointer(tmpMem, 64 * 1024);
1081 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
1082 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());
1083 tmpMem = alignPointer(tmpMem, 64 * 1024);
1085 if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
1087 HLTError("Insufficiant Track Memory");
1091 if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
1093 HLTError("Insufficiant Global Memory");
// Reset the per-slice counters the kernels will accumulate into.
1097 //Initialize Startup Constants
1098 *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
1099 *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
1100 *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
1101 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
// All slices share slice 0's data memory as the common texture base address.
1103 fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
1105 if (CUDASync("Initialization", iSlice, iSlice + firstSlice)) return(1);
1106 StandalonePerfTime(firstSlice + iSlice, 1);
// Phase 2 (optional): bind the slice-data buffer to textures for cached fetches.
// A non-zero bind offset would break texture indexing, hence the "|| offset" checks.
1109 #ifdef HLTCA_GPU_TEXTURE_FETCH
1110 cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
1112 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
1114 HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
1117 cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
1118 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
1120 HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
1123 cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
1124 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
1126 HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
// Phase 3: upload tracker objects (to constant memory) and event data (to global memory).
1131 //Copy Tracker Object to GPU Memory
1132 if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
1133 CudaFailedMsg(cudaMemcpyToSymbol(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice));
1135 //Copy Data to GPU Global Memory
1136 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1138 CudaFailedMsg(cudaMemcpy(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice));
1139 //printf("%lld %lld %d %d\n", (size_t) (char*) fGpuTracker[iSlice].Data().Memory(), (size_t) (char*) fSlaveTrackers[firstSlice + iSlice].Data().Memory(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize());
1141 //CudaFailedMsg(cudaMemcpy(SliceDataMemory(fGPUMemory, 0), SliceDataMemory(fHostLockedMemory, 0), tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0), cudaMemcpyHostToDevice));
1142 //printf("%lld %lld %d\n", (size_t) (char*) SliceDataMemory(fGPUMemory, 0), (size_t) (char*) SliceDataMemory(fHostLockedMemory, 0), (int) (size_t) (tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0)));
// Common memory and row tables of all slices are contiguous, so one copy each suffices.
1143 CudaFailedMsg(cudaMemcpy(fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyHostToDevice));
1144 CudaFailedMsg(cudaMemcpy(fGpuTracker[0].SliceDataRows(), fSlaveTrackers[firstSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) * sliceCountLocal, cudaMemcpyHostToDevice));
// Phase 4: kernel pipeline — each stage is synchronized and timed before the next.
// Row-count offsets (-2, -6) skip the border rows those kernels cannot process.
1146 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");
1147 AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursFinder> <<<30, 256>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows());
1148 if (CUDASync("Neighbours finder", 0, firstSlice)) return 1;
1149 StandalonePerfTime(firstSlice, 2);
1150 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");
1151 AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursCleaner> <<<30, 256>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 2);
1152 if (CUDASync("Neighbours Cleaner", 0, firstSlice)) return 1;
1153 StandalonePerfTime(firstSlice, 3);
1154 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");
1155 AliHLTTPCCAProcessMultiA<AliHLTTPCCAStartHitsFinder> <<<30, 256>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 6);
1156 if (CUDASync("Start Hits Finder", 0, firstSlice)) return 1;
1157 StandalonePerfTime(firstSlice, 4);
// Hit weights must be zero before the tracklet constructor competes for hits.
1159 ClearPPHitWeights <<<30, 256>>>(sliceCountLocal);
1160 if (CUDASync("Clear Hit Weights", 0, firstSlice)) return 1;
// Point the host trackers' track output at the pinned host buffers for the readback.
1162 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1164 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
// Perf counters 5/6 are skipped in the PP path (no start-hits sorter stage here).
1167 StandalonePerfTime(firstSlice, 7);
1169 if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
1170 AliHLTTPCCATrackletConstructorGPUPP<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal);
1171 if (CUDASync("Tracklet Constructor PP", 0, firstSlice)) return 1;
1173 StandalonePerfTime(firstSlice, 8);
1175 AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal);
1176 if (CUDASync("Tracklet Selector", 0, firstSlice)) return 1;
1177 StandalonePerfTime(firstSlice, 9);
// Phase 5: readback — counters first (they size the track/hit copies below).
1179 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice].CommonMemory(), fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyDeviceToHost));
1181 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1183 if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");
1185 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost));
1186 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost));
// Kernel-side errors are flagged in common memory; presumably an error return follows
// the HLTError in the full source — the return line is not visible. TODO confirm.
1188 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
1190 HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
1193 if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
// Phase 6: format the slice output on the host.
1195 fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
1196 #ifdef HLTCA_GPU_TIME_PROFILE
1197 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
1199 fSlaveTrackers[firstSlice + iSlice].WriteOutput();
1200 #ifdef HLTCA_GPU_TIME_PROFILE
1201 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
1202 printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
1206 StandalonePerfTime(firstSlice, 10);
1208 if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
int AliHLTTPCCAGPUTrackerNVCC::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
{
	// Initialize the host-side ("slave") tracker of one slice with the given parameters.
	// Returns 0 on success, 1 if the configured row count does not match the
	// compile-time constant the GPU code was built for.
	fSlaveTrackers[iSlice].Initialize(param);
	if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
	{
		// Fix: the check is for inequality, so report a mismatch rather than
		// claiming the count "exceeds" the constant (it may also be smaller).
		HLTError("Error, Slice Tracker %d Row Count of %d does not match Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
		return(1);
	}
	return(0);
}
1225 int AliHLTTPCCAGPUTrackerNVCC::ExitGPU()
// Tear down the CUDA context and release GPU and page-locked host resources.
// NOTE(review): brace/blank lines are missing from this view; a guard such as
// "if (fGPUMemory)" around the cudaFree below may have been lost. TODO confirm.
1227 //Uninitialize CUDA
// Wait for any in-flight kernels/copies before freeing the buffers they use.
1228 cudaThreadSynchronize();
1231 cudaFree(fGPUMemory);
1232 fGPUMemory = NULL;
// Streams and pinned memory are only released if they were actually allocated.
1234 if (fHostLockedMemory)
// CAMath::Max(3, fSliceCount) presumably mirrors the stream allocation count — confirm against InitGPU.
1236 for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
1238 cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
1240 free(fpCudaStreams);
1241 fGpuTracker = NULL;
1242 cudaFreeHost(fHostLockedMemory);
// cudaThreadExit destroys the context; on failure the error is reported and
// (presumably) an error code returned — the return line is not visible. TODO confirm.
1245 if (CudaFailedMsg(cudaThreadExit()))
1247 HLTError("Could not uninitialize GPU");
1250 HLTInfo("CUDA Uninitialized");
// Clear the global "GPU in use" flag and the per-instance init marker.
1251 fgGPUUsed = false;
1252 fCudaInitialized = 0;
void AliHLTTPCCAGPUTrackerNVCC::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
{
	// Store the output control structure and propagate it to every slave tracker.
	fOutputControl = val;
	int slice = 0;
	while (slice < fgkNSlices)
	{
		fSlaveTrackers[slice].SetOutputControl(val);
		slice++;
	}
}
int AliHLTTPCCAGPUTrackerNVCC::GetThread()
{
	// Return a numeric identifier of the calling OS thread.
	// NOTE(review): the conditional-compilation lines separating the two returns
	// were lost in this view; R__WIN32 is the macro used elsewhere in AliRoot — confirm.
#ifdef R__WIN32
	return((int) (size_t) GetCurrentThread());
#else
	// No glibc wrapper for gettid on old systems, hence the raw syscall.
	return((int) syscall (SYS_gettid));
#endif
}
unsigned long long int* AliHLTTPCCAGPUTrackerNVCC::PerfTimer(int iSlice, unsigned int i)
{
	// Return a pointer to performance timer i of slice iSlice,
	// or NULL when no slave trackers have been created.
	if (fSlaveTrackers == NULL)
	{
		return(NULL);
	}
	return(fSlaveTrackers[iSlice].PerfTimer(i));
}
const AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerNVCC::OutputControl() const
{
	// Accessor: the output control structure last set via SetOutputControl (may be NULL).
	return(fOutputControl);
}
int AliHLTTPCCAGPUTrackerNVCC::GetSliceCount() const
{
	// Accessor: maximum number of slices this tracker can process in one call.
	return fSliceCount;
}
AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()
{
	// Factory entry point: construct a CUDA tracker behind the abstract interface
	// (used when the library is loaded dynamically).
	AliHLTTPCCAGPUTrackerNVCC* const instance = new AliHLTTPCCAGPUTrackerNVCC;
	return instance;
}
1299 void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr)
\r