]>
Commit | Line | Data |
---|---|---|
5ae765b4 | 1 | // ************************************************************************** |
2 | // This file is property of and copyright by the ALICE HLT Project * | |
3 | // ALICE Experiment at CERN, All rights reserved. * | |
4 | // * | |
5 | // Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> * | |
6 | // Ivan Kisel <kisel@kip.uni-heidelberg.de> * | |
7 | // David Rohr <drohr@kip.uni-heidelberg.de> * | |
8 | // for The ALICE HLT Project. * | |
9 | // * | |
10 | // Permission to use, copy, modify and distribute this software and its * | |
11 | // documentation strictly for non-commercial purposes is hereby granted * | |
12 | // without fee, provided that the above copyright notice appears in all * | |
13 | // copies and that both the copyright notice and this permission notice * | |
14 | // appear in the supporting documentation. The authors make no claims * | |
15 | // about the suitability of this software for any purpose. It is * | |
16 | // provided "as is" without express or implied warranty. * | |
17 | // * | |
18 | //*************************************************************************** | |
19 | ||
5ae765b4 | 20 | #define FERMI |
21 | #include "AliHLTTPCCAGPUTrackerNVCC.h" | |
d3821846 | 22 | #include "AliHLTTPCCAGPUTrackerCommon.h" |
23 | #define get_global_id(dim) (blockIdx.x * blockDim.x + threadIdx.x) | |
24 | #define get_global_size(dim) (blockDim.x * gridDim.x) | |
25 | #define get_num_groups(dim) (gridDim.x) | |
26 | #define get_local_id(dim) (threadIdx.x) | |
27 | #define get_local_size(dim) (blockDim.x) | |
28 | #define get_group_id(dim) (blockIdx.x) | |
5ae765b4 | 29 | |
5ae765b4 | 30 | #include <cuda.h> |
31 | #include <sm_11_atomic_functions.h> | |
32 | #include <sm_12_atomic_functions.h> | |
5ae765b4 | 33 | |
34 | __constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )]; | |
35 | #ifdef HLTCA_GPU_TEXTURE_FETCH | |
36 | texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2; | |
37 | texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu; | |
38 | texture<signed short, 1, cudaReadModeElementType> gAliTexRefs; | |
39 | #endif | |
40 | ||
41 | //Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code | |
42 | #include "AliHLTTPCCATrackParam.cxx" | |
43 | #include "AliHLTTPCCATrack.cxx" | |
44 | ||
45 | #include "AliHLTTPCCAHitArea.cxx" | |
46 | #include "AliHLTTPCCAGrid.cxx" | |
47 | #include "AliHLTTPCCARow.cxx" | |
48 | #include "AliHLTTPCCAParam.cxx" | |
49 | #include "AliHLTTPCCATracker.cxx" | |
50 | ||
51 | #include "AliHLTTPCCAProcess.h" | |
52 | ||
53 | #include "AliHLTTPCCATrackletSelector.cxx" | |
54 | #include "AliHLTTPCCANeighboursFinder.cxx" | |
55 | #include "AliHLTTPCCANeighboursCleaner.cxx" | |
56 | #include "AliHLTTPCCAStartHitsFinder.cxx" | |
57 | #include "AliHLTTPCCAStartHitsSorter.cxx" | |
58 | #include "AliHLTTPCCATrackletConstructor.cxx" | |
59 | ||
60 | #ifdef HLTCA_GPU_MERGER | |
61 | #include "AliHLTTPCGMMerger.h" | |
62 | #include "AliHLTTPCGMTrackParam.cxx" | |
63 | #endif | |
64 | ||
5ae765b4 | 65 | ClassImp( AliHLTTPCCAGPUTrackerNVCC ) |
66 | ||
AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() : fpCudaStreams(NULL)
{
	// Allocate storage for the driver-API context handle up front; the context
	// itself is created later in InitGPU_Runtime() and released via the destructor.
	fCudaContext = (void*) new CUcontext;
}
71 | ||
AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC()
{
	// Release the CUcontext handle storage allocated in the constructor.
	delete (CUcontext*) fCudaContext;
}
76 | ||
// Select the best available CUDA device (or honor forceDeviceID), create the
// driver context, and allocate GPU global memory, page-locked host memory and
// one CUDA stream per slice (at least 3).
//   sliceCount    - number of slices this tracker instance will process
//   forceDeviceID - explicit CUDA device index, or -1 to auto-select
// Returns 0 on success, 1 on any failure (everything allocated so far is freed).
int AliHLTTPCCAGPUTrackerNVCC::InitGPU_Runtime(int sliceCount, int forceDeviceID)
{
	//Find best CUDA device, initialize and allocate memory

	cudaDeviceProp fCudaDeviceProp;

#ifndef CUDA_DEVICE_EMULATION
	int count, bestDevice = -1;
	long long int bestDeviceSpeed = 0, deviceSpeed;
	if (GPUFailedMsg(cudaGetDeviceCount(&count)))
	{
		HLTError("Error getting CUDA Device Count");
		return(1);
	}
	if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");
#if defined(FERMI) || defined(KEPLER)
	const int reqVerMaj = 2;
	const int reqVerMin = 0;
#else
	const int reqVerMaj = 1;
	const int reqVerMin = 2;
#endif
	for (int i = 0;i < count;i++)
	{
		if (fDebugLevel >= 4) printf("Examining device %d\n", i);
#if CUDA_VERSION > 3010
		size_t free, total;
#else
		unsigned int free, total;
#endif
		//Query free memory via the driver API: needs a temporary context on the device.
		cuInit(0);
		CUdevice tmpDevice;
		cuDeviceGet(&tmpDevice, i);
		CUcontext tmpContext;
		cuCtxCreate(&tmpContext, 0, tmpDevice);
		if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";
		cuCtxDestroy(tmpContext);
		if (fDebugLevel >= 4) printf("Obtained current memory usage for device %d\n", i);
		if (GPUFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i))) continue;
		if (fDebugLevel >= 4) printf("Obtained device properties for device %d\n", i);
		//Device is usable if it meets the required compute capability and has
		//100 MB of free memory on top of the tracker's own requirement.
		//BUGFIX: the headroom term was "100 * 1024 + 1024" (~101 KB) — a '+'/'*'
		//typo; the failure message below already reported the intended 100 MB.
		int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < reqVerMaj || (fCudaDeviceProp.major == reqVerMaj && fCudaDeviceProp.minor < reqVerMin)) && free >= fGPUMemSize + 100 * 1024 * 1024;
#ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
		//if (sliceCount > fCudaDeviceProp.multiProcessorCount * HLTCA_GPU_BLOCK_COUNT_CONSTRUCTOR_MULTIPLIER) deviceOK = 0;
#endif

		if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %lld / %lld)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, (long long int) free, (long long int) fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");
		//Heuristic speed metric: SMs * clock * warp size * free memory * major^2.
		deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free * (long long int) fCudaDeviceProp.major * (long long int) fCudaDeviceProp.major;
		if (deviceOK && deviceSpeed > bestDeviceSpeed)
		{
			bestDevice = i;
			bestDeviceSpeed = deviceSpeed;
		}
	}
	if (bestDevice == -1)
	{
		HLTWarning("No %sCUDA Device available, aborting CUDA Initialisation", count ? "appropriate " : "");
		HLTInfo("Requiring Revision %d.%d, Mem: %lld, Multiprocessors: %d", reqVerMaj, reqVerMin, fGPUMemSize + 100 * 1024 * 1024, sliceCount);
		return(1);
	}

	if (forceDeviceID == -1)
		fCudaDevice = bestDevice;
	else
		fCudaDevice = forceDeviceID;
#else
	fCudaDevice = 0;
#endif

	cudaGetDeviceProperties(&fCudaDeviceProp, fCudaDevice);

	if (fDebugLevel >= 1)
	{
		HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);
		HLTInfo("totalGlobalMem = %lld", (unsigned long long int) fCudaDeviceProp.totalGlobalMem);
		HLTInfo("sharedMemPerBlock = %lld", (unsigned long long int) fCudaDeviceProp.sharedMemPerBlock);
		HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);
		HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);
		HLTInfo("memPitch = %lld", (unsigned long long int) fCudaDeviceProp.memPitch);
		HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);
		HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);
		HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);
		HLTInfo("totalConstMem = %lld", (unsigned long long int) fCudaDeviceProp.totalConstMem);
		HLTInfo("major = %d", fCudaDeviceProp.major);
		HLTInfo("minor = %d", fCudaDeviceProp.minor);
		HLTInfo("clockRate = %d", fCudaDeviceProp.clockRate);
		HLTInfo("memoryClockRate = %d", fCudaDeviceProp.memoryClockRate);
		HLTInfo("multiProcessorCount = %d", fCudaDeviceProp.multiProcessorCount);
		HLTInfo("textureAlignment = %lld", (unsigned long long int) fCudaDeviceProp.textureAlignment);
	}
	fConstructorBlockCount = fCudaDeviceProp.multiProcessorCount * HLTCA_GPU_BLOCK_COUNT_CONSTRUCTOR_MULTIPLIER;
	selectorBlockCount = fCudaDeviceProp.multiProcessorCount * HLTCA_GPU_BLOCK_COUNT_SELECTOR_MULTIPLIER;

	if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
	{
		HLTError( "Unsupported CUDA Device" );
		return(1);
	}

	if (cuCtxCreate((CUcontext*) fCudaContext, CU_CTX_SCHED_AUTO, fCudaDevice) != CUDA_SUCCESS)
	{
		HLTError("Could not set CUDA Device!");
		return(1);
	}

	if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || GPUFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
	{
		HLTError("CUDA Memory Allocation Error");
		cudaThreadExit();
		return(1);
	}
	//Merger memory lives at the tail of the single large GPU allocation.
	fGPUMergerMemory = ((char*) fGPUMemory) + fGPUMemSize - fGPUMergerMaxMemory;
	if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
	int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
#ifdef HLTCA_GPU_MERGER
	hostMemSize += fGPUMergerMaxMemory;
#endif
	//Page-locked host memory enables asynchronous host<->device copies.
	if (GPUFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
	{
		cudaFree(fGPUMemory);
		cudaThreadExit();
		HLTError("Error allocating Page Locked Host Memory");
		return(1);
	}
	if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);
	fGPUMergerHostMemory = ((char*) fHostLockedMemory) + hostMemSize - fGPUMergerMaxMemory;

	if (fDebugLevel >= 1)
	{
		//Poison the GPU buffer in debug mode so stale/uninitialized data is recognizable.
		GPUFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
	}

	//One stream per slice, but at least 3 (init/copy streams are used even for few slices).
	fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
	cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
	for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
	{
		if (GPUFailedMsg(cudaStreamCreate(&cudaStreams[i])))
		{
			cudaFree(fGPUMemory);
			cudaFreeHost(fHostLockedMemory);
			cudaThreadExit();
			HLTError("Error creating CUDA Stream");
			return(1);
		}
	}

	//Detach the context from this thread; it is re-activated per reconstruction call.
	cuCtxPopCurrent((CUcontext*) fCudaContext);
	HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %d, Max slices: %d)", fCudaDevice, fCudaDeviceProp.name, fThreadId, fSliceCount);

	return(0);
}
227 | ||
// Report a CUDA runtime error together with its call site.
// Returns true if 'error' indicates a failure, false for cudaSuccess.
bool AliHLTTPCCAGPUTrackerNVCC::GPUFailedMsgA(cudaError_t error, const char* file, int line)
{
	const bool failed = (error != cudaSuccess);
	if (failed)
	{
		HLTWarning("CUDA Error: %d / %s (%s:%d)", error, cudaGetErrorString(error), file, line);
	}
	return(failed);
}
235 | ||
// Synchronize with the GPU and check for errors from the preceding kernel launch.
// 'state' names the pipeline stage for log messages; 'stream' and 'slice' identify
// where the error occurred. No-op when fDebugLevel == 0 (keeps execution asynchronous).
// Returns 0 on success, 1 on error.
int AliHLTTPCCAGPUTrackerNVCC::GPUSync(char* state, int stream, int slice)
{
	if (fDebugLevel == 0) return(0);

	//First pick up any sticky launch error...
	const cudaError launchErr = cudaGetLastError();
	if (launchErr != cudaSuccess)
	{
		HLTError("Cuda Error %s while running kernel (%s) (Stream %d; %d/%d)", cudaGetErrorString(launchErr), state, stream, slice, fgkNSlices);
		return(1);
	}
	//...then wait for the device and catch asynchronous execution errors.
	if (GPUFailedMsg(cudaThreadSynchronize()))
	{
		HLTError("CUDA Error while synchronizing (%s) (Stream %d; %d/%d)", state, stream, slice, fgkNSlices);
		return(1);
	}
	if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
	return(0);
}
256 | ||
#if defined(BITWISE_COMPATIBLE_DEBUG_OUTPUT) || defined(HLTCA_GPU_ALTERNATIVE_SCHEDULER)
void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker*, int, bool) {}
#else
// Debug helper: copy the row-block scheduler state of one slice back from the GPU
// and write it to *fOutFile. When 'check' is set, the phase-1 tracklet ordering is
// verified against consecutive numbering and the per-block starting tracklets are
// dumped as well. Only active at fDebugLevel >= 4.
void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
{
	if (fDebugLevel >= 4)
	{
		*fOutFile << "RowBlock Tracklets (Slice" << tracker[iSlice].Param().ISlice() << " (" << iSlice << " of reco))";
		*fOutFile << " after Tracklet Reconstruction";
		*fOutFile << std::endl;

		//Host-side staging buffers for the device-resident scheduler arrays.
		int4* hRowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);
		int* hRowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);
		uint2* hBlockStartingTracklet = (uint2*) malloc(sizeof(uint2) * fConstructorBlockCount);
		GPUFailedMsg(cudaMemcpy(hRowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
		GPUFailedMsg(cudaMemcpy(hRowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
		GPUFailedMsg(cudaMemcpy(hBlockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * fConstructorBlockCount, cudaMemcpyDeviceToHost));
		GPUFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));

		int expectedTracklet = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
		for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
		{
			*fOutFile << "Rowblock: " << i << ", up " << hRowBlockPos[i].y << "/" << hRowBlockPos[i].x << ", down " <<
				hRowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << hRowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";
			for (int j = 0;j < hRowBlockPos[i].x;j++)
			{
				//Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
				*fOutFile << hRowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
#ifdef HLTCA_GPU_SCHED_FIXED_START
				if (check && hRowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != expectedTracklet)
				{
					HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, hRowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], expectedTracklet);
				}
#endif //HLTCA_GPU_SCHED_FIXED_START
				expectedTracklet++;
				if (hRowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)
				{
					HLTError("Error, -1 Tracklet found");
				}
			}
			*fOutFile << std::endl << "Phase 2: ";
			for (int j = 0;j < hRowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
			{
				*fOutFile << hRowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
			}
			*fOutFile << std::endl;
		}

		if (check)
		{
			*fOutFile << "Starting Threads: (Slice" << tracker[iSlice].Param().ISlice() << ", First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;
			for (int i = 0;i < fConstructorBlockCount;i++)
			{
				*fOutFile << i << ": " << hBlockStartingTracklet[i].x << " - " << hBlockStartingTracklet[i].y << std::endl;
			}
		}

		free(hRowBlockPos);
		free(hRowBlockTracklets);
		free(hBlockStartingTracklet);
	}
}
#endif
321 | ||
// Reset the tracklet-scheduler row blocks and the per-hit weight array before
// tracklet construction. Stores go through int4 pointers for 16-byte vector
// writes; the grid-stride loops make the kernel correct for any launch
// configuration (sizes here are multiples of int4 by construction).
__global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)
{
	int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;
	const int stride = get_global_size(0);
	int4 zero4;
	zero4.x = zero4.y = zero4.z = zero4.w = 0;
#ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
	int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;
	int4 minusOne4;
	minusOne4.x = minusOne4.y = minusOne4.z = minusOne4.w = -1;
	//Row block positions are zeroed; tracklet slots are marked empty with -1.
	for (int i = get_global_id(0);i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)
		RowBlockPos[i] = zero4;
	for (int i = get_global_id(0);i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)
		rowBlockTracklets4[i] = minusOne4;
#endif
	for (int i = get_global_id(0);i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)
		sliceDataHitWeights4[i] = zero4;
}
341 | ||
342 | int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal) | |
343 | { | |
344 | //Primary reconstruction function | |
345 | ||
346 | cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams; | |
347 | ||
348 | if (Reconstruct_Base_Init(pOutput, pClusterData, firstSlice, sliceCountLocal)) return(1); | |
5ae765b4 | 349 | |
350 | #ifdef HLTCA_GPU_TEXTURE_FETCH | |
351 | cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>(); | |
352 | size_t offset; | |
d3821846 | 353 | if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR) |
5ae765b4 | 354 | { |
355 | HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset); | |
5ae765b4 | 356 | ResetHelperThreads(0); |
357 | return(1); | |
358 | } | |
359 | cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>(); | |
d3821846 | 360 | if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR) |
5ae765b4 | 361 | { |
362 | HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset); | |
5ae765b4 | 363 | ResetHelperThreads(0); |
364 | return(1); | |
365 | } | |
366 | cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>(); | |
d3821846 | 367 | if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR) |
5ae765b4 | 368 | { |
369 | HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset); | |
5ae765b4 | 370 | ResetHelperThreads(0); |
371 | return(1); | |
372 | } | |
373 | #endif | |
374 | ||
375 | //Copy Tracker Object to GPU Memory | |
376 | if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU"); | |
377 | #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE | |
378 | char* tmpMem; | |
d3821846 | 379 | if (GPUFailedMsg(cudaMalloc(&tmpMem, 100000000))) |
5ae765b4 | 380 | { |
381 | HLTError("Error allocating CUDA profile memory"); | |
5ae765b4 | 382 | ResetHelperThreads(0); |
383 | return(1); | |
384 | } | |
385 | fGpuTracker[0].fStageAtSync = tmpMem; | |
d3821846 | 386 | GPUFailedMsg(cudaMemset(fGpuTracker[0].StageAtSync(), 0, 100000000)); |
5ae765b4 | 387 | #endif |
d3821846 | 388 | GPUFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0])); |
389 | if (GPUSync("Initialization (1)", 0, firstSlice) RANDOM_ERROR) | |
5ae765b4 | 390 | { |
5ae765b4 | 391 | ResetHelperThreads(0); |
392 | return(1); | |
393 | } | |
394 | ||
5ae765b4 | 395 | for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) |
396 | { | |
d3821846 | 397 | if (Reconstruct_Base_SliceInit(pClusterData, iSlice, firstSlice)) return(1); |
5ae765b4 | 398 | |
399 | //Initialize temporary memory where needed | |
400 | if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory"); | |
401 | PreInitRowBlocks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign()); | |
d3821846 | 402 | if (GPUSync("Initialization (2)", 2, iSlice + firstSlice) RANDOM_ERROR) |
5ae765b4 | 403 | { |
5ae765b4 | 404 | ResetHelperThreads(1); |
405 | return(1); | |
406 | } | |
407 | ||
408 | //Copy Data to GPU Global Memory | |
d3821846 | 409 | GPUFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1])); |
410 | GPUFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1])); | |
411 | GPUFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1])); | |
5ae765b4 | 412 | |
413 | if (fDebugLevel >= 4) | |
414 | { | |
415 | if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory"); | |
416 | fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount); | |
417 | fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() ); | |
418 | } | |
419 | ||
d3821846 | 420 | if (GPUSync("Initialization (3)", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR) |
5ae765b4 | 421 | { |
5ae765b4 | 422 | ResetHelperThreads(1); |
423 | return(1); | |
424 | } | |
425 | StandalonePerfTime(firstSlice + iSlice, 1); | |
426 | ||
427 | if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder (Slice %d/%d)", iSlice, sliceCountLocal); | |
428 | AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), HLTCA_GPU_THREAD_COUNT_FINDER, 0, cudaStreams[iSlice & 1]>>>(iSlice); | |
429 | ||
d3821846 | 430 | if (GPUSync("Neighbours finder", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR) |
5ae765b4 | 431 | { |
5ae765b4 | 432 | ResetHelperThreads(1); |
433 | return(1); | |
434 | } | |
435 | ||
436 | StandalonePerfTime(firstSlice + iSlice, 2); | |
437 | ||
438 | if (fDebugLevel >= 4) | |
439 | { | |
d3821846 | 440 | GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost)); |
5ae765b4 | 441 | if (fDebugMask & 2) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile); |
442 | } | |
443 | ||
444 | if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner (Slice %d/%d)", iSlice, sliceCountLocal); | |
445 | AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice); | |
d3821846 | 446 | if (GPUSync("Neighbours Cleaner", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR) |
5ae765b4 | 447 | { |
5ae765b4 | 448 | ResetHelperThreads(1); |
449 | return(1); | |
450 | } | |
451 | ||
452 | StandalonePerfTime(firstSlice + iSlice, 3); | |
453 | ||
454 | if (fDebugLevel >= 4) | |
455 | { | |
d3821846 | 456 | GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost)); |
5ae765b4 | 457 | if (fDebugMask & 4) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile); |
458 | } | |
459 | ||
460 | if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder (Slice %d/%d)", iSlice, sliceCountLocal); | |
461 | AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice); | |
d3821846 | 462 | if (GPUSync("Start Hits Finder", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR) |
5ae765b4 | 463 | { |
5ae765b4 | 464 | ResetHelperThreads(1); |
465 | return(1); | |
466 | } | |
467 | ||
468 | StandalonePerfTime(firstSlice + iSlice, 4); | |
469 | ||
470 | if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter (Slice %d/%d)", iSlice, sliceCountLocal); | |
471 | AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice); | |
d3821846 | 472 | if (GPUSync("Start Hits Sorter", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR) |
5ae765b4 | 473 | { |
5ae765b4 | 474 | ResetHelperThreads(1); |
475 | return(1); | |
476 | } | |
477 | ||
478 | StandalonePerfTime(firstSlice + iSlice, 5); | |
479 | ||
480 | if (fDebugLevel >= 2) | |
481 | { | |
d3821846 | 482 | GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost)); |
5ae765b4 | 483 | if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d (Slice %d)", *fSlaveTrackers[firstSlice + iSlice].NTracklets(), iSlice); |
484 | if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS RANDOM_ERROR) | |
485 | { | |
486 | HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant"); | |
5ae765b4 | 487 | ResetHelperThreads(1); |
488 | return(1); | |
489 | } | |
490 | } | |
491 | ||
492 | if (fDebugLevel >= 4 && *fSlaveTrackers[firstSlice + iSlice].NTracklets()) | |
493 | { | |
494 | #ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT | |
d3821846 | 495 | GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost)); |
5ae765b4 | 496 | if (fDebugMask & 8) |
497 | { | |
498 | *fOutFile << "Temporary "; | |
499 | fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile); | |
500 | } | |
501 | uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows()); | |
d3821846 | 502 | GPUFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost)); |
5ae765b4 | 503 | if (fDebugMask & 16) |
504 | { | |
505 | *fOutFile << "Start Hits Sort Vector:" << std::endl; | |
506 | for (int i = 1;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows() - 5;i++) | |
507 | { | |
508 | *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl; | |
509 | } | |
510 | } | |
511 | free(tmpMemory); | |
512 | #endif | |
513 | ||
d3821846 | 514 | GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost)); |
5ae765b4 | 515 | if (fDebugMask & 32) fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile); |
516 | } | |
517 | ||
518 | StandalonePerfTime(firstSlice + iSlice, 6); | |
519 | ||
520 | fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters()); | |
521 | } | |
522 | ||
523 | for (int i = 0;i < fNHelperThreads;i++) | |
524 | { | |
525 | pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]); | |
526 | } | |
527 | ||
528 | StandalonePerfTime(firstSlice, 7); | |
529 | ||
530 | #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER | |
531 | int nHardCollisions = 0; | |
532 | ||
533 | RestartTrackletConstructor: | |
534 | if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler"); | |
535 | for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) | |
536 | { | |
537 | AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice); | |
d3821846 | 538 | if (GPUSync("Tracklet Initializer", -1, iSlice + firstSlice) RANDOM_ERROR) |
5ae765b4 | 539 | { |
540 | cudaThreadSynchronize(); | |
541 | cuCtxPopCurrent((CUcontext*) fCudaContext); | |
542 | return(1); | |
543 | } | |
544 | if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice); | |
545 | } | |
546 | #endif | |
547 | ||
548 | if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor"); | |
549 | AliHLTTPCCATrackletConstructorGPU<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR>>>(); | |
d3821846 | 550 | if (GPUSync("Tracklet Constructor", -1, firstSlice) RANDOM_ERROR) |
5ae765b4 | 551 | { |
552 | cudaThreadSynchronize(); | |
553 | cuCtxPopCurrent((CUcontext*) fCudaContext); | |
554 | return(1); | |
555 | } | |
556 | ||
557 | StandalonePerfTime(firstSlice, 8); | |
558 | ||
559 | if (fDebugLevel >= 4) | |
560 | { | |
561 | for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) | |
562 | { | |
563 | if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false); | |
d3821846 | 564 | GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost)); |
5ae765b4 | 565 | if (fDebugLevel >= 5) |
566 | { | |
567 | HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets()); | |
568 | } | |
d3821846 | 569 | GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost)); |
570 | GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost)); | |
5ae765b4 | 571 | if (0 && fSlaveTrackers[firstSlice + iSlice].NTracklets() && fSlaveTrackers[firstSlice + iSlice].Tracklet(0).NHits() < 0) |
572 | { | |
573 | cudaThreadSynchronize(); | |
574 | cuCtxPopCurrent((CUcontext*) fCudaContext); | |
575 | printf("INTERNAL ERROR\n"); | |
576 | return(1); | |
577 | } | |
578 | if (fDebugMask & 128) fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile); | |
579 | } | |
580 | } | |
581 | ||
582 | int runSlices = 0; | |
583 | for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices) | |
584 | { | |
585 | if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++; | |
586 | if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices); | |
587 | AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<selectorBlockCount, HLTCA_GPU_THREAD_COUNT_SELECTOR, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice)); | |
d3821846 | 588 | if (GPUSync("Tracklet Selector", iSlice, iSlice + firstSlice) RANDOM_ERROR) |
5ae765b4 | 589 | { |
590 | cudaThreadSynchronize(); | |
591 | cuCtxPopCurrent((CUcontext*) fCudaContext); | |
592 | return(1); | |
593 | } | |
594 | } | |
595 | StandalonePerfTime(firstSlice, 9); | |
596 | ||
597 | char *tmpMemoryGlobalTracking = NULL; | |
598 | fSliceOutputReady = 0; | |
d3821846 | 599 | |
600 | if (Reconstruct_Base_StartGlobal(pOutput, tmpMemoryGlobalTracking)) return(1); | |
5ae765b4 | 601 | |
602 | int tmpSlice = 0, tmpSlice2 = 0; | |
603 | for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) | |
604 | { | |
605 | if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host"); | |
606 | ||
607 | while(tmpSlice < sliceCountLocal && (tmpSlice == iSlice || cudaStreamQuery(cudaStreams[tmpSlice]) == (cudaError_t) CUDA_SUCCESS)) | |
608 | { | |
d3821846 | 609 | if (GPUFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice])) RANDOM_ERROR) |
5ae765b4 | 610 | { |
611 | ResetHelperThreads(1); | |
d3821846 | 612 | ActivateThreadContext(); |
5ae765b4 | 613 | return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal)); |
614 | } | |
615 | tmpSlice++; | |
616 | } | |
617 | ||
618 | while (tmpSlice2 < tmpSlice && (tmpSlice2 == iSlice ? cudaStreamSynchronize(cudaStreams[tmpSlice2]) : cudaStreamQuery(cudaStreams[tmpSlice2])) == (cudaError_t) CUDA_SUCCESS) | |
619 | { | |
d3821846 | 620 | if (*fSlaveTrackers[firstSlice + tmpSlice2].NTracks() > 0) |
621 | { | |
622 | GPUFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].Tracks(), fGpuTracker[tmpSlice2].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + tmpSlice2].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2])); | |
623 | GPUFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].TrackHits(), fGpuTracker[tmpSlice2].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + tmpSlice2].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2])); | |
624 | } | |
5ae765b4 | 625 | tmpSlice2++; |
626 | } | |
627 | ||
d3821846 | 628 | if (GPUFailedMsg(cudaStreamSynchronize(cudaStreams[iSlice])) RANDOM_ERROR) |
5ae765b4 | 629 | { |
630 | ResetHelperThreads(1); | |
d3821846 | 631 | ActivateThreadContext(); |
5ae765b4 | 632 | return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal)); |
633 | } | |
634 | ||
635 | if (fDebugLevel >= 4) | |
636 | { | |
d3821846 | 637 | GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost)); |
5ae765b4 | 638 | #ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT |
639 | if (fDebugMask & 256) fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile); | |
640 | #endif | |
641 | if (fDebugMask & 512) fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile); | |
642 | } | |
643 | ||
644 | if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError RANDOM_ERROR) | |
645 | { | |
646 | #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER | |
647 | if ((fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION || fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_WRONG_ROW)&& nHardCollisions++ < 10) | |
648 | { | |
649 | if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION) | |
650 | { | |
651 | HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor (Slice %d)", firstSlice + iSlice); | |
652 | } | |
653 | else | |
654 | { | |
655 | HLTWarning("Tracklet Constructor returned invalid row (Slice %d)", firstSlice + iSlice); | |
656 | } | |
657 | if (fDebugLevel >= 4) | |
658 | { | |
659 | ResetHelperThreads(1); | |
5ae765b4 | 660 | return(1); |
661 | } | |
662 | for (int i = 0;i < sliceCountLocal;i++) | |
663 | { | |
664 | cudaThreadSynchronize(); | |
d3821846 | 665 | GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost)); |
5ae765b4 | 666 | *fSlaveTrackers[firstSlice + i].NTracks() = 0; |
667 | *fSlaveTrackers[firstSlice + i].NTrackHits() = 0; | |
668 | fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE; | |
d3821846 | 669 | GPUFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice)); |
5ae765b4 | 670 | PreInitRowBlocks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign()); |
671 | } | |
672 | goto RestartTrackletConstructor; | |
673 | } | |
674 | #endif | |
675 | HLTError("GPU Tracker returned Error Code %d in slice %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError, firstSlice + iSlice); | |
5ae765b4 | 676 | ResetHelperThreads(1); |
677 | return(1); | |
678 | } | |
679 | if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits()); | |
680 | ||
d3821846 | 681 | if (Reconstruct_Base_FinishSlices(pOutput, iSlice, firstSlice)) return(1); |
5ae765b4 | 682 | |
683 | if (fDebugLevel >= 4) | |
684 | { | |
685 | delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory(); | |
686 | delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory(); | |
687 | } | |
688 | } | |
689 | ||
d3821846 | 690 | if (Reconstruct_Base_Finalize(pOutput, tmpMemoryGlobalTracking, firstSlice)) return(1); |
5ae765b4 | 691 | |
692 | /*for (int i = firstSlice;i < firstSlice + sliceCountLocal;i++) | |
693 | { | |
694 | fSlaveTrackers[i].DumpOutput(stdout); | |
695 | }*/ | |
696 | ||
697 | /*static int runnum = 0; | |
698 | std::ofstream tmpOut; | |
699 | char buffer[1024]; | |
700 | sprintf(buffer, "GPUtracks%d.out", runnum++); | |
701 | tmpOut.open(buffer); | |
702 | for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) | |
703 | { | |
704 | fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(tmpOut); | |
705 | } | |
706 | tmpOut.close();*/ | |
707 | ||
708 | #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE | |
709 | char* stageAtSync = (char*) malloc(100000000); | |
d3821846 | 710 | GPUFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].StageAtSync(), 100 * 1000 * 1000, cudaMemcpyDeviceToHost)); |
5ae765b4 | 711 | cudaFree(fGpuTracker[0].StageAtSync()); |
712 | ||
713 | FILE* fp = fopen("profile.txt", "w+"); | |
714 | FILE* fp2 = fopen("profile.bmp", "w+b"); | |
715 | int nEmptySync = 0, fEmpty; | |
716 | ||
717 | const int bmpheight = 8192; | |
718 | BITMAPFILEHEADER bmpFH; | |
719 | BITMAPINFOHEADER bmpIH; | |
720 | ZeroMemory(&bmpFH, sizeof(bmpFH)); | |
721 | ZeroMemory(&bmpIH, sizeof(bmpIH)); | |
722 | ||
723 | bmpFH.bfType = 19778; //"BM" | |
724 | bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR / 32 * 33 - 1) * bmpheight ; | |
725 | bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH); | |
726 | ||
727 | bmpIH.biSize = sizeof(bmpIH); | |
728 | bmpIH.biWidth = fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR / 32 * 33 - 1; | |
729 | bmpIH.biHeight = bmpheight; | |
730 | bmpIH.biPlanes = 1; | |
731 | bmpIH.biBitCount = 32; | |
732 | ||
733 | fwrite(&bmpFH, 1, sizeof(bmpFH), fp2); | |
734 | fwrite(&bmpIH, 1, sizeof(bmpIH), fp2); | |
735 | ||
736 | for (int i = 0;i < bmpheight * fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;i += fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR) | |
737 | { | |
738 | fEmpty = 1; | |
739 | for (int j = 0;j < fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;j++) | |
740 | { | |
741 | fprintf(fp, "%d\t", stageAtSync[i + j]); | |
742 | int color = 0; | |
743 | if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0); | |
744 | if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0); | |
745 | if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255); | |
746 | if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0); | |
747 | fwrite(&color, 1, sizeof(int), fp2); | |
748 | if (j > 0 && j % 32 == 0) | |
749 | { | |
750 | color = RGB(255, 255, 255); | |
751 | fwrite(&color, 1, 4, fp2); | |
752 | } | |
753 | if (stageAtSync[i + j]) fEmpty = 0; | |
754 | } | |
755 | fprintf(fp, "\n"); | |
756 | if (fEmpty) nEmptySync++; | |
757 | else nEmptySync = 0; | |
758 | //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break; | |
759 | } | |
760 | ||
761 | fclose(fp); | |
762 | fclose(fp2); | |
763 | free(stageAtSync); | |
764 | #endif | |
765 | ||
766 | cuCtxPopCurrent((CUcontext*) fCudaContext); | |
767 | return(0); | |
768 | } | |
769 | ||
__global__ void ClearPPHitWeights(int sliceCount)
{
	// Reset the per-hit weight array of every slice tracker to zero.
	// Writes int4 (four ints at a time); NumberOfHitsPlusAlign() is the
	// count in ints, so the int4 element count is nInts * sizeof(int) / sizeof(int4).
	// All launched threads cooperate via a grid-stride loop over each slice.

	for (int slice = 0;slice < sliceCount;slice++)
	{
		AliHLTTPCCATracker &tracker = ((AliHLTTPCCATracker*) gAliHLTTPCCATracker)[slice];
		int4* const weights = (int4*) tracker.Data().HitWeights();
		const int nInts = tracker.Data().NumberOfHitsPlusAlign();
		const int step = get_global_size(0);
		int4 zero;
		zero.x = zero.y = zero.z = zero.w = 0;

		for (int idx = get_global_id(0);idx < nInts * sizeof(int) / sizeof(int4);idx += step)
		{
			weights[idx] = zero;
		}
	}
}
789 | ||
int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
{
	//Primary reconstruction function for small events (PP).
	//Runs the full per-slice chain (neighbours finder/cleaner, start hits,
	//tracklet constructor PP variant, tracklet selector) on the GPU in one pass,
	//without the stream/pipeline machinery of the normal Reconstruct path.
	//Returns 0 on success, 1 on any error.

	// fGpuTracker mirrors the host-side slice trackers; pointers are then
	// rewired below to point into GPU memory.
	memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);

	if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");

	// Slice data is laid out consecutively (64 KB aligned) in both the pinned
	// host buffer and the GPU buffer, advancing both cursors in lockstep.
	char* tmpSliceMemHost = (char*) SliceDataMemory(fHostLockedMemory, 0);
	char* tmpSliceMemGpu = (char*) SliceDataMemory(fGPUMemory, 0);

	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		StandalonePerfTime(firstSlice + iSlice, 0);

		//Initialize GPU Slave Tracker
		if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
		fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(tmpSliceMemHost, RowMemory(fHostLockedMemory, firstSlice + iSlice));
		fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
		if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
		{
			HLTError("Insufficiant Slice Data Memory");
			return(1);
		}

		//Make this a GPU Tracker
		fGpuTracker[iSlice].SetGPUTracker();
		fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));

		fGpuTracker[iSlice].SetGPUSliceDataMemory(tmpSliceMemGpu, RowMemory(fGPUMemory, iSlice));
		fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);

		// Advance both cursors by the same (host-measured) size so host and
		// GPU layouts stay congruent; 64 KB alignment between slices.
		tmpSliceMemHost += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();
		tmpSliceMemHost = alignPointer(tmpSliceMemHost, 64 * 1024);
		tmpSliceMemGpu += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();
		tmpSliceMemGpu = alignPointer(tmpSliceMemGpu, 64 * 1024);

		//Set Pointers to GPU Memory
		char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);

		if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
		tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
		tmpMem = alignPointer(tmpMem, 64 * 1024);

		if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
		tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount);
		tmpMem = alignPointer(tmpMem, 64 * 1024);

		if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
		tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
		tmpMem = alignPointer(tmpMem, 64 * 1024);

		if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
		{
			HLTError("Insufficiant Track Memory");
			return(1);
		}

		if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
		{
			HLTError("Insufficiant Global Memory");
			return(1);
		}

		//Initialize Startup Constants
		*fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
		*fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
		*fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
		fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;

		// All texture fetches are offsets relative to slice 0's data memory.
		fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());

		if (GPUSync("Initialization", -1, iSlice + firstSlice)) return(1);
		StandalonePerfTime(firstSlice + iSlice, 1);
	}

#ifdef HLTCA_GPU_TEXTURE_FETCH
	// Bind the whole slice-data region to three texture references of
	// different element types; a nonzero offset would break the kernels'
	// index arithmetic, hence the "|| offset" failure condition.
	cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
	size_t offset;
	if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
	{
		HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
		return(1);
	}
	cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
	if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
	{
		HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
		return(1);
	}
	cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
	if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
	{
		HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
		return(1);
	}
#endif

	//Copy Tracker Object to GPU Memory (constant memory symbol used by all kernels)
	if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
	GPUFailedMsg(cudaMemcpyToSymbol(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice));

	//Copy Data to GPU Global Memory
	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		GPUFailedMsg(cudaMemcpy(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice));
		//printf("%lld %lld %d %d\n", (size_t) (char*) fGpuTracker[iSlice].Data().Memory(), (size_t) (char*) fSlaveTrackers[firstSlice + iSlice].Data().Memory(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize());
	}
	//GPUFailedMsg(cudaMemcpy(SliceDataMemory(fGPUMemory, 0), SliceDataMemory(fHostLockedMemory, 0), tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0), cudaMemcpyHostToDevice));
	//printf("%lld %lld %d\n", (size_t) (char*) SliceDataMemory(fGPUMemory, 0), (size_t) (char*) SliceDataMemory(fHostLockedMemory, 0), (int) (size_t) (tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0)));
	// Common memory / row structures of all slices are contiguous, so a single
	// copy of CommonMemorySize() * sliceCountLocal covers them all.
	GPUFailedMsg(cudaMemcpy(fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyHostToDevice));
	GPUFailedMsg(cudaMemcpy(fGpuTracker[0].SliceDataRows(), fSlaveTrackers[firstSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) * sliceCountLocal, cudaMemcpyHostToDevice));

	// --- Kernel chain; perf counters 2,3,4 bracket the preparation stages ---
	if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");
	AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursFinder> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_FINDER>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows());
	if (GPUSync("Neighbours finder", -1, firstSlice)) return 1;
	StandalonePerfTime(firstSlice, 2);
	if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");
	AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursCleaner> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 2);
	if (GPUSync("Neighbours Cleaner", -1, firstSlice)) return 1;
	StandalonePerfTime(firstSlice, 3);
	if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");
	AliHLTTPCCAProcessMultiA<AliHLTTPCCAStartHitsFinder> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 6);
	if (GPUSync("Start Hits Finder", -1, firstSlice)) return 1;
	StandalonePerfTime(firstSlice, 4);

	// Hit weights must be zeroed before the constructor kernel runs.
	ClearPPHitWeights <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(sliceCountLocal);
	if (GPUSync("Clear Hit Weights", -1, firstSlice)) return 1;

	// Host-side track output buffers live in the pinned host memory region.
	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
	}

	StandalonePerfTime(firstSlice, 7);

	if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
	AliHLTTPCCATrackletConstructorGPUPP<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR>>>(0, sliceCountLocal);
	if (GPUSync("Tracklet Constructor PP", -1, firstSlice)) return 1;

	StandalonePerfTime(firstSlice, 8);

	AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<selectorBlockCount, HLTCA_GPU_THREAD_COUNT_SELECTOR>>>(0, sliceCountLocal);
	if (GPUSync("Tracklet Selector", -1, firstSlice)) return 1;
	StandalonePerfTime(firstSlice, 9);

	// Fetch per-slice counters first; they give the sizes of the track copies below.
	GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice].CommonMemory(), fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyDeviceToHost));

	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");

		GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost));
		GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost));

		if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
		{
			HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
			return(1);
		}
		if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
		// Record the local (pre-global-tracking) counts so the global step can
		// report how many tracks/hits it added on top.
		fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTracks = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracks;
		fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTrackHits = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTrackHits;
	}

	// Optional CPU global tracking across slice borders; only possible when
	// all slices were processed in this call.
	if (fGlobalTracking && sliceCountLocal == fgkNSlices)
	{
		// Scratch tracklet buffer shared by all slices (each expects >= 1 tracklet).
		char tmpMemory[sizeof(AliHLTTPCCATracklet)
#ifdef EXTERN_ROW_HITS
		+ HLTCA_ROW_COUNT * sizeof(int)
#endif
		+ 16];

		for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
		{
			if (fSlaveTrackers[iSlice].CommonMemory()->fNTracklets)
			{
				HLTError("Slave tracker tracklets found where none expected, memory not freed!\n");
			}
			fSlaveTrackers[iSlice].SetGPUTrackerTrackletsMemory(&tmpMemory[0], 1, fConstructorBlockCount);
			fSlaveTrackers[iSlice].CommonMemory()->fNTracklets = 1;
		}

		for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
		{
			// Neighbour slices wrap around separately within each detector
			// half (slices [0, fgkNSlices/2) and [fgkNSlices/2, fgkNSlices)).
			int sliceLeft = (iSlice + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2);
			int sliceRight = (iSlice + 1) % (fgkNSlices / 2);
			if (iSlice >= fgkNSlices / 2)
			{
				sliceLeft += fgkNSlices / 2;
				sliceRight += fgkNSlices / 2;
			}
			fSlaveTrackers[iSlice].PerformGlobalTracking(fSlaveTrackers[sliceLeft], fSlaveTrackers[sliceRight], HLTCA_GPU_MAX_TRACKS);
		}

		for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
		{
			printf("Slice %d - Tracks: Local %d Global %d - Hits: Local %d Global %d\n", iSlice, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTracks, fSlaveTrackers[iSlice].CommonMemory()->fNTracks, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTrackHits, fSlaveTrackers[iSlice].CommonMemory()->fNTrackHits);
		}
	}

	// Write the per-slice output structures for the caller.
	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
		fSlaveTrackers[firstSlice + iSlice].WriteOutputPrepare();
		fSlaveTrackers[firstSlice + iSlice].WriteOutput();
	}

	StandalonePerfTime(firstSlice, 10);

	if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");

	return(0);
}
1004 | ||
int AliHLTTPCCAGPUTrackerNVCC::ExitGPU_Runtime()
{
	//Uninitialize CUDA: free all GPU/pinned allocations, destroy streams,
	//shut down the runtime and destroy the driver context.
	//Returns 0 on success, 1 if the runtime could not be shut down.

	// The driver context must be current on this thread for the cleanup calls.
	cuCtxPushCurrent(*((CUcontext*) fCudaContext));

	// Make sure no kernel/copy is still using the memory we are about to free.
	cudaThreadSynchronize();
	if (fGPUMemory)
	{
		cudaFree(fGPUMemory);
		fGPUMemory = NULL;
	}
	if (fHostLockedMemory)
	{
		// Streams were created alongside the pinned buffer; same count as in init
		// (at least 3, otherwise one per slice).
		for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
		{
			cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
		}
		free(fpCudaStreams);
		// fGpuTracker pointed into the pinned region; invalidate before freeing it.
		fGpuTracker = NULL;
		cudaFreeHost(fHostLockedMemory);
	}

	if (GPUFailedMsg(cudaThreadExit()))
	{
		HLTError("Could not uninitialize GPU");
		return(1);
	}

	cuCtxDestroy(*((CUcontext*) fCudaContext));

	cudaDeviceReset();

	HLTInfo("CUDA Uninitialized");
	fCudaInitialized = 0;
	return(0);
}
1041 | ||
int AliHLTTPCCAGPUTrackerNVCC::RefitMergedTracks(AliHLTTPCGMMerger* Merger)
{
	//Refit the merged tracks on the GPU.
	//Carves per-array buffers out of the preallocated merger memory, copies the
	//cluster/track data to the GPU, runs the RefitTracks kernel, and copies the
	//results back. Prints transfer/fit timings at fDebugLevel > 0.
	//Returns 0 on success, 1 on error (or when HLTCA_GPU_MERGER is not compiled in).
#ifndef HLTCA_GPU_MERGER
	HLTError("HLTCA_GPU_MERGER compile flag not set");
	return(1);
#else
	if (!fCudaInitialized)
	{
		HLTError("GPU Merger not initialized");
		return(1);
	}
	// a..d: timestamps around the three phases; e: timer frequency.
	unsigned long long int a, b, c, d, e;
	AliHLTTPCCATracker::StandaloneQueryFreq(&e);

	char* gpumem = (char*) fGPUMergerMemory;
	float *X, *Y, *Z, *Angle;
	unsigned int *RowType;
	AliHLTTPCGMMergedTrack* tracks;
	float* field;
	AliHLTTPCCAParam* param;

	gpumem = alignPointer(gpumem, 1024 * 1024);

	// Sub-allocate device arrays from the merger memory block, advancing gpumem.
	AssignMemory(X, gpumem, Merger->NClusters());
	AssignMemory(Y, gpumem, Merger->NClusters());
	AssignMemory(Z, gpumem, Merger->NClusters());
	AssignMemory(Angle, gpumem, Merger->NClusters());
	AssignMemory(RowType, gpumem, Merger->NClusters());
	AssignMemory(tracks, gpumem, Merger->NOutputTracks());
	AssignMemory(field, gpumem, 6);
	AssignMemory(param, gpumem, 1);

	if ((size_t) (gpumem - (char*) fGPUMergerMemory) > (size_t) fGPUMergerMaxMemory)
	{
		HLTError("Insufficiant GPU Merger Memory");
		// BUGFIX: previously only logged and fell through, which would have
		// copied past the end of the merger buffer below.
		return(1);
	}

	cuCtxPushCurrent(*((CUcontext*) fCudaContext));

	if (fDebugLevel >= 2) HLTInfo("Running GPU Merger (%d/%d)", Merger->NOutputTrackClusters(), Merger->NClusters());
	AliHLTTPCCATracker::StandaloneQueryTime(&a);
	// Upload cluster coordinates, track metadata, field polynomial and parameters.
	GPUFailedMsg(cudaMemcpy(X, Merger->ClusterX(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
	GPUFailedMsg(cudaMemcpy(Y, Merger->ClusterY(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
	GPUFailedMsg(cudaMemcpy(Z, Merger->ClusterZ(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
	GPUFailedMsg(cudaMemcpy(Angle, Merger->ClusterAngle(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
	GPUFailedMsg(cudaMemcpy(RowType, Merger->ClusterRowType(), Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyHostToDevice));
	GPUFailedMsg(cudaMemcpy(tracks, Merger->OutputTracks(), Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyHostToDevice));
	GPUFailedMsg(cudaMemcpy(field, Merger->PolinomialFieldBz(), 6 * sizeof(float), cudaMemcpyHostToDevice));
	GPUFailedMsg(cudaMemcpy(param, fSlaveTrackers[0].pParam(), sizeof(AliHLTTPCCAParam), cudaMemcpyHostToDevice));
	AliHLTTPCCATracker::StandaloneQueryTime(&b);
	RefitTracks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(tracks, Merger->NOutputTracks(), field, X, Y, Z, RowType, Angle, param);
	GPUFailedMsg(cudaThreadSynchronize());
	AliHLTTPCCATracker::StandaloneQueryTime(&c);
	// Download the refitted clusters and tracks back into the merger.
	GPUFailedMsg(cudaMemcpy(Merger->ClusterX(), X, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
	GPUFailedMsg(cudaMemcpy(Merger->ClusterY(), Y, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
	GPUFailedMsg(cudaMemcpy(Merger->ClusterZ(), Z, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
	GPUFailedMsg(cudaMemcpy(Merger->ClusterAngle(), Angle, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
	GPUFailedMsg(cudaMemcpy(Merger->ClusterRowType(), RowType, Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyDeviceToHost));
	GPUFailedMsg(cudaMemcpy((void*) Merger->OutputTracks(), tracks, Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyDeviceToHost));
	GPUFailedMsg(cudaThreadSynchronize());
	AliHLTTPCCATracker::StandaloneQueryTime(&d);
	if (fDebugLevel >= 2) HLTInfo("GPU Merger Finished");

	if (fDebugLevel > 0)
	{
		// Report upload, fit, and download durations plus effective bandwidth.
		int copysize = 4 * Merger->NOutputTrackClusters() * sizeof(float) + Merger->NOutputTrackClusters() * sizeof(unsigned int) + Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack) + 6 * sizeof(float) + sizeof(AliHLTTPCCAParam);
		double speed = (double) copysize * (double) e / (double) (b - a) / 1e9;
		printf("GPU Fit:\tCopy To:\t%lld us (%lf GB/s)\n", (b - a) * 1000000 / e, speed);
		printf("\t\tFit:\t%lld us\n", (c - b) * 1000000 / e);
		speed = (double) copysize * (double) e / (double) (d - c) / 1e9;
		printf("\t\tCopy From:\t%lld us (%lf GB/s)\n", (d - c) * 1000000 / e, speed);
	}

	cuCtxPopCurrent((CUcontext*) fCudaContext);
	return(0);
#endif
}
1119 | ||
int AliHLTTPCCAGPUTrackerNVCC::GPUMergerAvailable()
{
	// The CUDA backend always reports the GPU merger as available
	// (RefitMergedTracks still fails at runtime if HLTCA_GPU_MERGER is unset).
	return(1);
}
1124 | ||
void AliHLTTPCCAGPUTrackerNVCC::ActivateThreadContext()
{
	// Make the tracker's CUDA driver context current on the calling thread.
	CUcontext* const context = (CUcontext*) fCudaContext;
	cuCtxPushCurrent(*context);
}
void AliHLTTPCCAGPUTrackerNVCC::ReleaseThreadContext()
{
	// Detach the tracker's CUDA driver context from the calling thread.
	CUcontext* const context = (CUcontext*) fCudaContext;
	cuCtxPopCurrent(context);
}
1133 | ||
void AliHLTTPCCAGPUTrackerNVCC::SynchronizeGPU()
{
	// Block the host until all previously issued GPU work has completed.
	cudaThreadSynchronize();
}
1138 | ||
AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()
{
	// C-style factory: hands out the CUDA tracker through the abstract base type.
	AliHLTTPCCAGPUTrackerNVCC* const tracker = new AliHLTTPCCAGPUTrackerNVCC;
	return tracker;
}
1143 | ||
void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr)
{
	// Counterpart to AliHLTTPCCAGPUTrackerNVCCCreate; destroys through the
	// base-class pointer (deleting a null pointer is a no-op).
	if (ptr != NULL)
	{
		delete ptr;
	}
}
1148 |