1 // **************************************************************************
2 // This file is property of and copyright by the ALICE HLT Project *
3 // ALICE Experiment at CERN, All rights reserved. *
5 // Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
6 // Ivan Kisel <kisel@kip.uni-heidelberg.de> *
7 // David Rohr <drohr@kip.uni-heidelberg.de> *
8 // for The ALICE HLT Project. *
10 // Permission to use, copy, modify and distribute this software and its *
11 // documentation strictly for non-commercial purposes is hereby granted *
12 // without fee, provided that the above copyright notice appears in all *
13 // copies and that both the copyright notice and this permission notice *
14 // appear in the supporting documentation. The authors make no claims *
15 // about the suitability of this software for any purpose. It is *
16 // provided "as is" without express or implied warranty. *
18 //***************************************************************************
20 #include "AliHLTTPCCAGPUTracker.h"
24 #include "AliHLTTPCCADef.h"
25 #include "AliHLTTPCCAGPUConfig.h"
27 #include <sm_11_atomic_functions.h>
28 #include <sm_12_atomic_functions.h>
32 //Disable assertions since they produce errors in GPU Code
38 __constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];
39 #ifdef HLTCA_GPU_TEXTURE_FETCH
40 texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;
41 texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;
42 texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
45 #include "AliHLTTPCCAHit.h"
47 //Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code
48 #include "AliHLTTPCCATrackParam.cxx"
49 #include "AliHLTTPCCATrack.cxx"
51 #include "AliHLTTPCCATrackletSelector.cxx"
53 #include "AliHLTTPCCAHitArea.cxx"
54 #include "AliHLTTPCCAGrid.cxx"
55 #include "AliHLTTPCCARow.cxx"
56 #include "AliHLTTPCCAParam.cxx"
57 #include "AliHLTTPCCATracker.cxx"
59 #include "AliHLTTPCCAOutTrack.cxx"
61 #include "AliHLTTPCCAProcess.h"
63 #include "AliHLTTPCCANeighboursFinder.cxx"
65 #include "AliHLTTPCCANeighboursCleaner.cxx"
66 #include "AliHLTTPCCAStartHitsFinder.cxx"
67 #include "AliHLTTPCCAStartHitsSorter.cxx"
68 #include "AliHLTTPCCATrackletConstructor.cxx"
69 #include "AliHLTTPCCASliceOutput.cxx"
71 #include "MemoryAssignmentHelpers.h"
73 #ifndef HLTCA_STANDALONE
74 #include "AliHLTDefinitions.h"
75 #include "AliHLTSystem.h"
78 ClassImp( AliHLTTPCCAGPUTracker )
80 bool AliHLTTPCCAGPUTracker::fgGPUUsed = false;
82 int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
84 //Find best CUDA device, initialize and allocate memory
88 HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in same process");
92 cudaDeviceProp fCudaDeviceProp;
94 #ifndef CUDA_DEVICE_EMULATION
95 int count, bestDevice = -1, bestDeviceSpeed = 0;
96 if (CudaFailedMsg(cudaGetDeviceCount(&count)))
98 HLTError("Error getting CUDA Device Count");
101 if (fDebugLevel >= 2) std::cout << "Available CUDA devices: ";
102 for (int i = 0;i < count;i++)
104 cudaGetDeviceProperties(&fCudaDeviceProp, i);
105 if (fDebugLevel >= 2) std::cout << fCudaDeviceProp.name << " (" << i << ") ";
106 if (fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && fCudaDeviceProp.multiProcessorCount * fCudaDeviceProp.clockRate > bestDeviceSpeed)
109 bestDeviceSpeed = fCudaDeviceProp.multiProcessorCount * fCudaDeviceProp.clockRate;
112 if (fDebugLevel >= 2) std::cout << std::endl;
114 if (bestDevice == -1)
116 HLTWarning("No CUDA Device available, aborting CUDA Initialisation");
121 if (forceDeviceID == -1)
122 cudaDevice = bestDevice;
124 cudaDevice = forceDeviceID;
129 cudaGetDeviceProperties(&fCudaDeviceProp ,cudaDevice );
131 if (fDebugLevel >= 1)
133 std::cout<<"CUDA Device Properties: "<<std::endl;
134 std::cout<<"name = "<<fCudaDeviceProp.name<<std::endl;
135 std::cout<<"totalGlobalMem = "<<fCudaDeviceProp.totalGlobalMem<<std::endl;
136 std::cout<<"sharedMemPerBlock = "<<fCudaDeviceProp.sharedMemPerBlock<<std::endl;
137 std::cout<<"regsPerBlock = "<<fCudaDeviceProp.regsPerBlock<<std::endl;
138 std::cout<<"warpSize = "<<fCudaDeviceProp.warpSize<<std::endl;
139 std::cout<<"memPitch = "<<fCudaDeviceProp.memPitch<<std::endl;
140 std::cout<<"maxThreadsPerBlock = "<<fCudaDeviceProp.maxThreadsPerBlock<<std::endl;
141 std::cout<<"maxThreadsDim = "<<fCudaDeviceProp.maxThreadsDim[0]<<" "<<fCudaDeviceProp.maxThreadsDim[1]<<" "<<fCudaDeviceProp.maxThreadsDim[2]<<std::endl;
142 std::cout<<"maxGridSize = " <<fCudaDeviceProp.maxGridSize[0]<<" "<<fCudaDeviceProp.maxGridSize[1]<<" "<<fCudaDeviceProp.maxGridSize[2]<<std::endl;
143 std::cout<<"totalConstMem = "<<fCudaDeviceProp.totalConstMem<<std::endl;
144 std::cout<<"major = "<<fCudaDeviceProp.major<<std::endl;
145 std::cout<<"minor = "<<fCudaDeviceProp.minor<<std::endl;
146 std::cout<<"clockRate = "<<fCudaDeviceProp.clockRate<<std::endl;
147 std::cout<<"textureAlignment = "<<fCudaDeviceProp.textureAlignment<<std::endl;
150 if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
152 HLTError( "Unsupported CUDA Device" );
156 if (CudaFailedMsg(cudaSetDevice(cudaDevice)))
158 HLTError("Could not set CUDA Device!");
162 if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
164 HLTError("Insufficiant Common Memory");
169 if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
171 HLTError("Insufficiant Row Memory");
176 fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
177 if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
179 HLTError("CUDA Memory Allocation Error");
183 if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
184 int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
185 if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
187 cudaFree(fGPUMemory);
189 HLTError("Error allocating Page Locked Host Memory");
192 if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);
194 if (fDebugLevel >= 1)
196 CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
198 HLTInfo("CUDA Initialisation successfull");
200 //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
201 if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
203 HLTError("Insufficiant Tracker Object Memory");
206 fSliceCount = sliceCount;
207 fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);
209 for (int i = 0;i < fgkNSlices;i++)
211 fSlaveTrackers[i].SetGPUTracker();
212 fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
213 fSlaveTrackers[i].pData()->SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
216 fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
217 cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
218 for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
220 if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
222 HLTError("Error creating CUDA Stream");
227 #if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)
230 //Do one initial run for Benchmark reasons
231 const int useDebugLevel = fDebugLevel;
233 AliHLTTPCCAClusterData tmpCluster;
236 fin.open("events/event.0.dump");
237 tmpCluster.ReadEvent(fin);
240 AliHLTTPCCASliceOutput *tmpOutput = NULL;
241 AliHLTTPCCAParam tmpParam;
242 AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;
243 fSlaveTrackers[0].SetOutputControl(&tmpOutputControl);
244 tmpParam.SetNRows(HLTCA_ROW_COUNT);
245 fSlaveTrackers[0].SetParam(tmpParam);
246 Reconstruct(&tmpOutput, &tmpCluster, 0, 1);
249 fSlaveTrackers[0].SetOutputControl(NULL);
250 fDebugLevel = useDebugLevel;
257 template <class T> inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int alignment)
259 //Macro to align Pointers.
260 //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker
261 //(As long as every single data structure is <= 1 MB)
263 size_t adr = (size_t) ptr;
266 adr += alignment - (adr % alignment);
271 bool AliHLTTPCCAGPUTracker::CudaFailedMsg(cudaError_t error)
273 //Check for CUDA Error and in the case of an error display the corresponding error string
274 if (error == cudaSuccess) return(false);
275 HLTWarning("CUDA Error: %d / %s", error, cudaGetErrorString(error));
279 int AliHLTTPCCAGPUTracker::CUDASync(char* state)
281 //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards
283 if (fDebugLevel == 0) return(0);
285 cuErr = cudaGetLastError();
286 if (cuErr != cudaSuccess)
288 HLTError("Cuda Error %s while invoking kernel (%s)", cudaGetErrorString(cuErr), state);
291 if (CudaFailedMsg(cudaThreadSynchronize()))
293 HLTError("CUDA Error while synchronizing (%s)", state);
296 if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
300 void AliHLTTPCCAGPUTracker::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
302 //Set Debug Level and Debug output File if applicable
303 fDebugLevel = dwLevel;
304 if (NewOutFile) fOutFile = NewOutFile;
307 int AliHLTTPCCAGPUTracker::SetGPUTrackerOption(char* OptionName, int /*OptionValue*/)
309 //Set a specific GPU Tracker Option
311 HLTError("Unknown Option: %s", OptionName);
314 //No Options used at the moment
318 #ifdef HLTCA_STANDALONE
319 void AliHLTTPCCAGPUTracker::StandalonePerfTime(int iSlice, int i)
321 //Run Performance Query for timer i of slice iSlice
322 if (fDebugLevel >= 1)
324 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));
328 void AliHLTTPCCAGPUTracker::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
331 void AliHLTTPCCAGPUTracker::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
333 //Dump Rowblocks to File
334 if (fDebugLevel >= 4)
336 *fOutFile << "RowBlock Tracklets" << std::endl;
338 int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);
339 int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);
340 uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT);
341 CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
342 CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
343 CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT, cudaMemcpyDeviceToHost));
344 CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
346 int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
347 for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
349 *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " <<
350 rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << endl << "Phase 1: ";
351 for (int j = 0;j < rowBlockPos[i].x;j++)
353 //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
354 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
355 if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)
357 HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);
360 if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)
362 HLTError("Error, -1 Tracklet found");
365 *fOutFile << endl << "Phase 2: ";
366 for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
368 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
375 *fOutFile << "Starting Threads: (First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;
376 for (int i = 0;i < HLTCA_GPU_BLOCK_COUNT;i++)
378 *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;
383 free(rowBlockTracklets);
384 free(blockStartingTracklet);
388 __global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)
390 //Initialize GPU RowBlocks and HitWeights
391 int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;
392 int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;
393 const int stride = blockDim.x * gridDim.x;
395 i0.x = i0.y = i0.z = i0.w = 0;
396 i1.x = i1.y = i1.z = i1.w = -1;
397 for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)
399 for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)
400 rowBlockTracklets4[i] = i1;
401 for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)
402 sliceDataHitWeights4[i] = i0;
405 int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
407 //Primary reconstruction function
408 cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
410 if (sliceCountLocal == -1) sliceCountLocal = this->fSliceCount;
412 if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
414 HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));
418 if (fDebugLevel >= 4)
420 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
422 *fOutFile << endl << endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << endl;
426 memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
428 if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice + sliceCountLocal].Param().ISlice());
429 if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
431 #ifdef HLTCA_GPU_TIME_PROFILE
433 QueryPerformanceFrequency((LARGE_INTEGER*) &c);
434 QueryPerformanceCounter((LARGE_INTEGER*) &d);
437 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
439 //Make this a GPU Tracker
440 fGpuTracker[iSlice].SetGPUTracker();
441 fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
442 fGpuTracker[iSlice].pData()->SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
443 fGpuTracker[iSlice].pData()->SetPointers(&pClusterData[iSlice], false);
445 //Set Pointers to GPU Memory
446 char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
448 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
449 tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
450 tmpMem = alignPointer(tmpMem, 1024 * 1024);
452 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
453 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);
454 tmpMem = alignPointer(tmpMem, 1024 * 1024);
456 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
457 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());
458 tmpMem = alignPointer(tmpMem, 1024 * 1024);
460 if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
462 HLTError("Insufficiant Track Memory");
466 if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
468 HLTError("Insufficiant Global Memory");
472 //Initialize Startup Constants
473 *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
474 *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
475 *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
476 fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = HLTCA_GPU_BLOCK_COUNT * (iSlice + 1) / sliceCountLocal - HLTCA_GPU_BLOCK_COUNT * (iSlice) / sliceCountLocal;
477 if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);
478 fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
479 fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
480 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
481 fGpuTracker[iSlice].pData()->SetGPUTextureBase(fGpuTracker[0].Data().Memory());
484 #ifdef HLTCA_GPU_TEXTURE_FETCH
485 cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
487 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
489 HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
492 cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
493 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
495 HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
498 cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
499 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
501 HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
506 //Copy Tracker Object to GPU Memory
507 if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
508 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
509 if (CudaFailedMsg(cudaMalloc(&fGpuTracker[0].fStageAtSync, 100000000))) return(1);
510 CudaFailedMsg(cudaMemset(fGpuTracker[0].fStageAtSync, 0, 100000000));
512 CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));
514 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
516 StandalonePerfTime(firstSlice + iSlice, 0);
518 //Initialize GPU Slave Tracker
519 if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
520 fSlaveTrackers[firstSlice + iSlice].pData()->SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
521 #ifdef HLTCA_GPU_TIME_PROFILE
522 QueryPerformanceCounter((LARGE_INTEGER*) &a);
524 fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
525 #ifdef HLTCA_GPU_TIME_PROFILE
526 QueryPerformanceCounter((LARGE_INTEGER*) &b);
527 printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
529 if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
531 HLTError("Insufficiant Slice Data Memory");
535 /*if (fSlaveTrackers[firstSlice + iSlice].CheckEmptySlice())
537 if (fDebugLevel >= 3) HLTInfo("Slice Empty, not running GPU Tracker");
538 if (sliceCountLocal == 1)
542 //Initialize temporary memory where needed
543 if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");
544 PreInitRowBlocks<<<30, 256, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());
546 //Copy Data to GPU Global Memory
547 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
548 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
549 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
551 if (fDebugLevel >= 4)
553 if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
554 fSlaveTrackers[firstSlice + iSlice].TrackletMemory() = reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] );
555 fSlaveTrackers[firstSlice + iSlice].SetPointersTracklets( HLTCA_GPU_MAX_TRACKLETS );
556 fSlaveTrackers[firstSlice + iSlice].HitMemory() = reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100] );
557 fSlaveTrackers[firstSlice + iSlice].SetPointersHits( pClusterData[iSlice].NumberOfClusters() );
560 if (CUDASync("Initialization")) return(1);
561 StandalonePerfTime(firstSlice + iSlice, 1);
563 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");
564 AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
566 if (CUDASync("Neighbours finder")) return 1;
568 StandalonePerfTime(firstSlice + iSlice, 2);
570 if (fDebugLevel >= 4)
572 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
573 fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
576 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");
577 AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
578 if (CUDASync("Neighbours Cleaner")) return 1;
580 StandalonePerfTime(firstSlice + iSlice, 3);
582 if (fDebugLevel >= 4)
584 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
585 fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
588 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");
589 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
590 if (CUDASync("Start Hits Finder")) return 1;
592 StandalonePerfTime(firstSlice + iSlice, 4);
594 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter");
595 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<30, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
596 if (CUDASync("Start Hits Sorter")) return 1;
598 StandalonePerfTime(firstSlice + iSlice, 5);
600 if (fDebugLevel >= 2)
602 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
603 if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
604 if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS)
606 HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant");
611 if (fDebugLevel >= 4)
613 *fOutFile << "Temporary ";
614 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
615 fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
616 uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
617 CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
618 *fOutFile << "Start Hits Sort Vector:" << std::endl;
619 for (int i = 0;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows();i++)
621 *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;
625 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
626 fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
629 StandalonePerfTime(firstSlice + iSlice, 6);
631 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
634 StandalonePerfTime(firstSlice, 7);
635 #ifdef HLTCA_GPU_PREFETCHDATA
636 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
638 if (fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v) > ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4))
640 HLTError("Insufficiant GPU shared Memory, required: %d, available %d", fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v), ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4));
643 if (fDebugLevel >= 1)
645 static int infoShown = 0;
648 HLTInfo("GPU Shared Memory Cache Size: %d", 2 * fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v));
655 if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
656 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
658 AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);
659 if (CUDASync("Tracklet Initializer")) return 1;
660 DumpRowBlocks(fSlaveTrackers, iSlice);
663 if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
664 AliHLTTPCCATrackletConstructorNewGPU<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>();
665 if (CUDASync("Tracklet Constructor (new)")) return 1;
667 StandalonePerfTime(firstSlice, 8);
669 if (fDebugLevel >= 4)
671 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
673 DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);
674 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
675 if (fDebugLevel >= 5)
677 HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
679 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
680 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
681 fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);
685 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT)
687 if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT);
688 AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT, sliceCountLocal - iSlice));
690 if (CUDASync("Tracklet Selector")) return 1;
691 StandalonePerfTime(firstSlice, 9);
693 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + 0].CommonMemory(), fGpuTracker[0].CommonMemory(), fGpuTracker[0].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[0]));
694 for (int iSliceTmp = 0;iSliceTmp <= sliceCountLocal;iSliceTmp++)
696 if (iSliceTmp < sliceCountLocal)
698 int iSlice = iSliceTmp;
699 if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");
700 cudaStreamSynchronize(cudaStreams[iSlice]);
701 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
702 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
703 if (iSlice + 1 < sliceCountLocal)
704 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[iSlice + 1]));
709 int iSlice = iSliceTmp - 1;
710 cudaStreamSynchronize(cudaStreams[iSlice]);
712 if (fDebugLevel >= 4)
714 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
715 fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
716 fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);
719 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
721 HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
724 if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
726 fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
727 #ifdef HLTCA_GPU_TIME_PROFILE
728 QueryPerformanceCounter((LARGE_INTEGER*) &a);
730 fSlaveTrackers[firstSlice + iSlice].WriteOutput();
731 #ifdef HLTCA_GPU_TIME_PROFILE
732 QueryPerformanceCounter((LARGE_INTEGER*) &b);
733 printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
736 if (fDebugLevel >= 4)
738 delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();
739 delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();
744 StandalonePerfTime(firstSlice, 10);
746 if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
748 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
749 char* stageAtSync = (char*) malloc(100000000);
750 CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].fStageAtSync, 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
751 cudaFree(fGpuTracker[0].fStageAtSync);
753 FILE* fp = fopen("profile.txt", "w+");
754 FILE* fp2 = fopen("profile.bmp", "w+b");
755 int nEmptySync = 0, fEmpty;
757 const int bmpheight = 1000;
758 BITMAPFILEHEADER bmpFH;
759 BITMAPINFOHEADER bmpIH;
760 ZeroMemory(&bmpFH, sizeof(bmpFH));
761 ZeroMemory(&bmpIH, sizeof(bmpIH));
763 bmpFH.bfType = 19778; //"BM"
764 bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1) * bmpheight ;
765 bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);
767 bmpIH.biSize = sizeof(bmpIH);
768 bmpIH.biWidth = HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1;
769 bmpIH.biHeight = bmpheight;
771 bmpIH.biBitCount = 32;
773 fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
774 fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);
776 for (int i = 0;i < bmpheight * HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;i += HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT)
779 for (int j = 0;j < HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;j++)
781 fprintf(fp, "%d\t", stageAtSync[i + j]);
783 if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);
784 if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);
785 if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);
786 if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);
787 fwrite(&color, 1, sizeof(int), fp2);
788 if (j > 0 && j % 32 == 0)
790 color = RGB(255, 255, 255);
791 fwrite(&color, 1, 4, fp2);
793 if (stageAtSync[i + j]) fEmpty = 0;
796 if (fEmpty) nEmptySync++;
798 //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;
809 int AliHLTTPCCAGPUTracker::InitializeSliceParam(int iSlice, AliHLTTPCCAParam ¶m)
811 //Initialize Slice Tracker Parameter for a slave tracker
812 fSlaveTrackers[iSlice].Initialize(param);
813 if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
815 HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
821 int AliHLTTPCCAGPUTracker::ExitGPU()
824 cudaThreadSynchronize();
827 cudaFree(fGPUMemory);
830 if (fHostLockedMemory)
832 for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
834 cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
838 cudaFreeHost(fHostLockedMemory);
841 if (CudaFailedMsg(cudaThreadExit()))
843 HLTError("Could not uninitialize GPU");
846 HLTInfo("CUDA Uninitialized");
851 void AliHLTTPCCAGPUTracker::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
853 fOutputControl = val;
854 for (int i = 0;i < fgkNSlices;i++)
856 fSlaveTrackers[i].SetOutputControl(val);