// **************************************************************************
// This file is property of and copyright by the ALICE HLT Project          *
// ALICE Experiment at CERN, All rights reserved.                           *
//                                                                          *
// Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
//                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *
//                  David Rohr <drohr@kip.uni-heidelberg.de>                *
//                  for The ALICE HLT Project.                              *
//                                                                          *
// Permission to use, copy, modify and distribute this software and its    *
// documentation strictly for non-commercial purposes is hereby granted    *
// without fee, provided that the above copyright notice appears in all    *
// copies and that both the copyright notice and this permission notice    *
// appear in the supporting documentation. The authors make no claims      *
// about the suitability of this software for any purpose. It is           *
// provided "as is" without express or implied warranty.                   *
//                                                                          *
//***************************************************************************

#define FERMI
#include "AliHLTTPCCAGPUTrackerNVCC.h"
#include "AliHLTTPCCAGPUTrackerCommon.h"
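//Map the OpenCL-style work-item query functions used in the shared tracker sources onto the corresponding CUDA built-ins (only dimension 0 is used on this backend).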
#define get_global_id(dim) (blockIdx.x * blockDim.x + threadIdx.x)
#define get_global_size(dim) (blockDim.x * gridDim.x)
#define get_num_groups(dim) (gridDim.x)
#define get_local_id(dim) (threadIdx.x)
#define get_local_size(dim) (blockDim.x)
#define get_group_id(dim) (blockIdx.x)

#include <cuda.h>
#include <sm_11_atomic_functions.h>
#include <sm_12_atomic_functions.h>

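//The per-slice AliHLTTPCCATracker objects live in constant memory; they are filled via cudaMemcpyToSymbol(Async) before the kernels run.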
__constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];
#ifdef HLTCA_GPU_TEXTURE_FETCH
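//Texture references for the optional texture-fetch read path of the slice data; bound to the slice data memory in Reconstruct() / ReconstructPP().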
texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;
texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;
texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
#endif

//Include the CXX files; the GPUd() macro then produces CUDA device code from the common tracker sources
#include "AliHLTTPCCATrackParam.cxx"
#include "AliHLTTPCCATrack.cxx"

#include "AliHLTTPCCAHitArea.cxx"
#include "AliHLTTPCCAGrid.cxx"
#include "AliHLTTPCCARow.cxx"
#include "AliHLTTPCCAParam.cxx"
#include "AliHLTTPCCATracker.cxx"

#include "AliHLTTPCCAProcess.h"

#include "AliHLTTPCCATrackletSelector.cxx"
#include "AliHLTTPCCANeighboursFinder.cxx"
#include "AliHLTTPCCANeighboursCleaner.cxx"
#include "AliHLTTPCCAStartHitsFinder.cxx"
#include "AliHLTTPCCAStartHitsSorter.cxx"
#include "AliHLTTPCCATrackletConstructor.cxx"

#ifdef HLTCA_GPU_MERGER
#include "AliHLTTPCGMMerger.h"
#include "AliHLTTPCGMTrackParam.cxx"
#endif

ClassImp( AliHLTTPCCAGPUTrackerNVCC )

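//The driver-API context is kept behind an opaque void* so the class header does not need to include CUDA headers; it is created in InitGPU_Runtime and activated via cuCtxPushCurrent / cuCtxPopCurrent.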
AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() : fpCudaStreams(NULL)
{
	fCudaContext = (void*) new CUcontext;
}

AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC()
{
	delete (CUcontext*) fCudaContext;
}

int AliHLTTPCCAGPUTrackerNVCC::InitGPU_Runtime(int sliceCount, int forceDeviceID)
{
	//Find best CUDA device, initialize and allocate memory

	cudaDeviceProp fCudaDeviceProp;

#ifndef CUDA_DEVICE_EMULATION
	int count, bestDevice = -1;
	long long int bestDeviceSpeed = 0, deviceSpeed;
	if (GPUFailedMsg(cudaGetDeviceCount(&count)))
	{
		HLTError("Error getting CUDA Device Count");
		return(1);
	}
	if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");
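	//Minimum required compute capability: 2.0 for Fermi/Kepler builds, 1.2 otherwise.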
#if defined(FERMI) || defined(KEPLER)
	const int reqVerMaj = 2;
	const int reqVerMin = 0;
#else
	const int reqVerMaj = 1;
	const int reqVerMin = 2;
#endif
	for (int i = 0;i < count;i++)
	{
		if (fDebugLevel >= 4) printf("Examining device %d\n", i);
#if CUDA_VERSION > 3010
		size_t free, total;
#else
		unsigned int free, total;
#endif
		cuInit(0);
		CUdevice tmpDevice;
		cuDeviceGet(&tmpDevice, i);
		CUcontext tmpContext;
		cuCtxCreate(&tmpContext, 0, tmpDevice);
		if (cuMemGetInfo(&free, &total)) std::cout << "Error\n";
		cuCtxDestroy(tmpContext);
		if (fDebugLevel >= 4) printf("Obtained current memory usage for device %d\n", i);
		if (GPUFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i))) continue;
		if (fDebugLevel >= 4) printf("Obtained device properties for device %d\n", i);
		int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < reqVerMaj || (fCudaDeviceProp.major == reqVerMaj && fCudaDeviceProp.minor < reqVerMin)) && free >= fGPUMemSize + 100 * 1024 * 1024;
#ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
		//if (sliceCount > fCudaDeviceProp.multiProcessorCount * HLTCA_GPU_BLOCK_COUNT_CONSTRUCTOR_MULTIPLIER) deviceOK = 0;
#endif

		if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %lld / %lld)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, (long long int) free, (long long int) fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");
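		//Crude device ranking: multiprocessor count x clock rate x warp size, weighted by free memory and the squared major compute capability.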
		deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free * (long long int) fCudaDeviceProp.major * (long long int) fCudaDeviceProp.major;
		if (deviceOK && deviceSpeed > bestDeviceSpeed)
		{
			bestDevice = i;
			bestDeviceSpeed = deviceSpeed;
		}
	}
	if (bestDevice == -1)
	{
		HLTWarning("No %sCUDA Device available, aborting CUDA Initialisation", count ? "appropriate " : "");
		HLTInfo("Requiring Revision %d.%d, Mem: %lld, Multiprocessors: %d", reqVerMaj, reqVerMin, fGPUMemSize + 100 * 1024 * 1024, sliceCount);
		return(1);
	}

	if (forceDeviceID == -1)
		fCudaDevice = bestDevice;
	else
		fCudaDevice = forceDeviceID;
#else
	fCudaDevice = 0;
#endif

	cudaGetDeviceProperties(&fCudaDeviceProp, fCudaDevice);

	if (fDebugLevel >= 1)
	{
		HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);
		HLTInfo("totalGlobalMem = %lld", (unsigned long long int) fCudaDeviceProp.totalGlobalMem);
		HLTInfo("sharedMemPerBlock = %lld", (unsigned long long int) fCudaDeviceProp.sharedMemPerBlock);
		HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);
		HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);
		HLTInfo("memPitch = %lld", (unsigned long long int) fCudaDeviceProp.memPitch);
		HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);
		HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);
		HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);
		HLTInfo("totalConstMem = %lld", (unsigned long long int) fCudaDeviceProp.totalConstMem);
		HLTInfo("major = %d", fCudaDeviceProp.major);
		HLTInfo("minor = %d", fCudaDeviceProp.minor);
		HLTInfo("clockRate = %d", fCudaDeviceProp.clockRate);
		HLTInfo("memoryClockRate = %d", fCudaDeviceProp.memoryClockRate);
		HLTInfo("multiProcessorCount = %d", fCudaDeviceProp.multiProcessorCount);
		HLTInfo("textureAlignment = %lld", (unsigned long long int) fCudaDeviceProp.textureAlignment);
	}
	fConstructorBlockCount = fCudaDeviceProp.multiProcessorCount * HLTCA_GPU_BLOCK_COUNT_CONSTRUCTOR_MULTIPLIER;
	selectorBlockCount = fCudaDeviceProp.multiProcessorCount * HLTCA_GPU_BLOCK_COUNT_SELECTOR_MULTIPLIER;

	if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
	{
		HLTError("Unsupported CUDA Device");
		return(1);
	}

	if (cuCtxCreate((CUcontext*) fCudaContext, CU_CTX_SCHED_AUTO, fCudaDevice) != CUDA_SUCCESS)
	{
		HLTError("Could not create CUDA context!");
		return(1);
	}

	if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || GPUFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
	{
		HLTError("CUDA Memory Allocation Error");
		cudaThreadExit();
		return(1);
	}
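	//The merger scratch space is carved from the tail of the single large GPU allocation (and likewise from the host locked memory below).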
	fGPUMergerMemory = ((char*) fGPUMemory) + fGPUMemSize - fGPUMergerMaxMemory;
	if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
	int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
#ifdef HLTCA_GPU_MERGER
	hostMemSize += fGPUMergerMaxMemory;
#endif
	if (GPUFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
	{
		cudaFree(fGPUMemory);
		cudaThreadExit();
		HLTError("Error allocating Page Locked Host Memory");
		return(1);
	}
	if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);
	fGPUMergerHostMemory = ((char*) fHostLockedMemory) + hostMemSize - fGPUMergerMaxMemory;

	if (fDebugLevel >= 1)
	{
		GPUFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
	}

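	//One CUDA stream per slice, but at least 3: streams 0 and 1 alternate for per-slice transfers and kernels, stream 2 initializes temporary memory.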
	fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
	cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
	for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
	{
		if (GPUFailedMsg(cudaStreamCreate(&cudaStreams[i])))
		{
			cudaFree(fGPUMemory);
			cudaFreeHost(fHostLockedMemory);
			cudaThreadExit();
			HLTError("Error creating CUDA Stream");
			return(1);
		}
	}

	cuCtxPopCurrent((CUcontext*) fCudaContext);
	HLTImportant("CUDA Initialisation successful (Device %d: %s, Thread %d, Max slices: %d)", fCudaDevice, fCudaDeviceProp.name, fThreadId, fSliceCount);

	return(0);
}

bool AliHLTTPCCAGPUTrackerNVCC::GPUFailedMsgA(cudaError_t error, const char* file, int line)
{
	//Check for CUDA Error and in the case of an error display the corresponding error string
	if (error == cudaSuccess) return(false);
	HLTWarning("CUDA Error: %d / %s (%s:%d)", error, cudaGetErrorString(error), file, line);
	return(true);
}

int AliHLTTPCCAGPUTrackerNVCC::GPUSync(char* state, int stream, int slice)
{
	//Wait for CUDA-Kernel to finish and check for CUDA errors afterwards

	if (fDebugLevel == 0) return(0);
	cudaError cuErr;
	cuErr = cudaGetLastError();
	if (cuErr != cudaSuccess)
	{
		HLTError("CUDA Error %s while running kernel (%s) (Stream %d; %d/%d)", cudaGetErrorString(cuErr), state, stream, slice, fgkNSlices);
		return(1);
	}
	if (GPUFailedMsg(cudaThreadSynchronize()))
	{
		HLTError("CUDA Error while synchronizing (%s) (Stream %d; %d/%d)", state, stream, slice, fgkNSlices);
		return(1);
	}
	if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
	return(0);
}

#if defined(BITWISE_COMPATIBLE_DEBUG_OUTPUT) || defined(HLTCA_GPU_ALTERNATIVE_SCHEDULER)
void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker*, int, bool) {}
#else
void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
{
	//Dump Rowblocks to File
	if (fDebugLevel >= 4)
	{
		*fOutFile << "RowBlock Tracklets (Slice" << tracker[iSlice].Param().ISlice() << " (" << iSlice << " of reco))";
		*fOutFile << " after Tracklet Reconstruction";
		*fOutFile << std::endl;

		int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);
		int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);
		uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * fConstructorBlockCount);
		GPUFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
		GPUFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
		GPUFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * fConstructorBlockCount, cudaMemcpyDeviceToHost));
		GPUFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));

		int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
		for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
		{
			*fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " <<
				rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";
			for (int j = 0;j < rowBlockPos[i].x;j++)
			{
				//Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
				*fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
#ifdef HLTCA_GPU_SCHED_FIXED_START
				if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)
				{
					HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);
				}
#endif //HLTCA_GPU_SCHED_FIXED_START
				k++;
				if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)
				{
					HLTError("Error, -1 Tracklet found");
				}
			}
			*fOutFile << std::endl << "Phase 2: ";
			for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
			{
				*fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
			}
			*fOutFile << std::endl;
		}

		if (check)
		{
			*fOutFile << "Starting Threads: (Slice" << tracker[iSlice].Param().ISlice() << ", First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;
			for (int i = 0;i < fConstructorBlockCount;i++)
			{
				*fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;
			}
		}

		free(rowBlockPos);
		free(rowBlockTracklets);
		free(blockStartingTracklet);
	}
}
#endif

__global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)
{
	//Initialize GPU RowBlocks and HitWeights
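	//All clears below run through int4 stores, so every thread writes 16 bytes at a time and the accesses stay coalesced.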
	int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;
	const int stride = get_global_size(0);
	int4 i0;
	i0.x = i0.y = i0.z = i0.w = 0;
#ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
	int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;
	int4 i1;
	i1.x = i1.y = i1.z = i1.w = -1;
	for (int i = get_global_id(0);i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)
		RowBlockPos[i] = i0;
	for (int i = get_global_id(0);i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)
		rowBlockTracklets4[i] = i1;
#endif
	for (int i = get_global_id(0);i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)
		sliceDataHitWeights4[i] = i0;
}

int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
{
	//Primary reconstruction function

	cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;

	if (Reconstruct_Base_Init(pOutput, pClusterData, firstSlice, sliceCountLocal)) return(1);

#ifdef HLTCA_GPU_TEXTURE_FETCH
	cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
	size_t offset;
	if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
	{
		HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
		ResetHelperThreads(0);
		return(1);
	}
	cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
	if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
	{
		HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
		ResetHelperThreads(0);
		return(1);
	}
	cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
	if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
	{
		HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
		ResetHelperThreads(0);
		return(1);
	}
#endif

	//Copy Tracker Object to GPU Memory
	if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
	char* tmpMem;
	if (GPUFailedMsg(cudaMalloc(&tmpMem, 100000000)))
	{
		HLTError("Error allocating CUDA profile memory");
		ResetHelperThreads(0);
		return(1);
	}
	fGpuTracker[0].fStageAtSync = tmpMem;
	GPUFailedMsg(cudaMemset(fGpuTracker[0].StageAtSync(), 0, 100000000));
#endif
	GPUFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));
	if (GPUSync("Initialization (1)", 0, firstSlice) RANDOM_ERROR)
	{
		ResetHelperThreads(0);
		return(1);
	}

	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		if (Reconstruct_Base_SliceInit(pClusterData, iSlice, firstSlice)) return(1);

		//Initialize temporary memory where needed
		if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");
		PreInitRowBlocks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());
		if (GPUSync("Initialization (2)", 2, iSlice + firstSlice) RANDOM_ERROR)
		{
			ResetHelperThreads(1);
			return(1);
		}

		//Copy Data to GPU Global Memory
		GPUFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
		GPUFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
		GPUFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));

		if (fDebugLevel >= 4)
		{
			if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
			fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount);
			fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() );
		}

		if (GPUSync("Initialization (3)", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR)
		{
			ResetHelperThreads(1);
			return(1);
		}
		StandalonePerfTime(firstSlice + iSlice, 1);

		if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder (Slice %d/%d)", iSlice, sliceCountLocal);
		AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), HLTCA_GPU_THREAD_COUNT_FINDER, 0, cudaStreams[iSlice & 1]>>>(iSlice);

		if (GPUSync("Neighbours finder", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR)
		{
			ResetHelperThreads(1);
			return(1);
		}

		StandalonePerfTime(firstSlice + iSlice, 2);

		if (fDebugLevel >= 4)
		{
			GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
			if (fDebugMask & 2) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
		}

		if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner (Slice %d/%d)", iSlice, sliceCountLocal);
		AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice);
		if (GPUSync("Neighbours Cleaner", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR)
		{
			ResetHelperThreads(1);
			return(1);
		}

		StandalonePerfTime(firstSlice + iSlice, 3);

		if (fDebugLevel >= 4)
		{
			GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
			if (fDebugMask & 4) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
		}

		if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder (Slice %d/%d)", iSlice, sliceCountLocal);
		AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice);
		if (GPUSync("Start Hits Finder", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR)
		{
			ResetHelperThreads(1);
			return(1);
		}

		StandalonePerfTime(firstSlice + iSlice, 4);

		if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter (Slice %d/%d)", iSlice, sliceCountLocal);
		AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice);
		if (GPUSync("Start Hits Sorter", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR)
		{
			ResetHelperThreads(1);
			return(1);
		}

		StandalonePerfTime(firstSlice + iSlice, 5);

		if (fDebugLevel >= 2)
		{
			GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
			if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d (Slice %d)", *fSlaveTrackers[firstSlice + iSlice].NTracklets(), iSlice);
			if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS RANDOM_ERROR)
			{
				HLTError("HLTCA_GPU_MAX_TRACKLETS constant insufficient");
				ResetHelperThreads(1);
				return(1);
			}
		}

		if (fDebugLevel >= 4 && *fSlaveTrackers[firstSlice + iSlice].NTracklets())
		{
#ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
			GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
			if (fDebugMask & 8)
			{
				*fOutFile << "Temporary ";
				fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
			}
			uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
			GPUFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
			if (fDebugMask & 16)
			{
				*fOutFile << "Start Hits Sort Vector:" << std::endl;
				for (int i = 1;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows() - 5;i++)
				{
					*fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;
				}
			}
			free(tmpMemory);
#endif

			GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
			if (fDebugMask & 32) fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
		}

		StandalonePerfTime(firstSlice + iSlice, 6);

		fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
	}

	for (int i = 0;i < fNHelperThreads;i++)
	{
		pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]);
	}

	StandalonePerfTime(firstSlice, 7);

#ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
	int nHardCollisions = 0;

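//On a hard scheduling collision or an invalid-row error the tracklet constructor is re-entered from this label (see the error handling further down).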
RestartTrackletConstructor:
	if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);
		if (GPUSync("Tracklet Initializer", -1, iSlice + firstSlice) RANDOM_ERROR)
		{
			cudaThreadSynchronize();
			cuCtxPopCurrent((CUcontext*) fCudaContext);
			return(1);
		}
		if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice);
	}
#endif

	if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
	AliHLTTPCCATrackletConstructorGPU<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR>>>();
	if (GPUSync("Tracklet Constructor", -1, firstSlice) RANDOM_ERROR)
	{
		cudaThreadSynchronize();
		cuCtxPopCurrent((CUcontext*) fCudaContext);
		return(1);
	}

	StandalonePerfTime(firstSlice, 8);

	if (fDebugLevel >= 4)
	{
		for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
		{
			if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);
			GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
			if (fDebugLevel >= 5)
			{
				HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
			}
			GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
			GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
			if (0 && fSlaveTrackers[firstSlice + iSlice].NTracklets() && fSlaveTrackers[firstSlice + iSlice].Tracklet(0).NHits() < 0)
			{
				cudaThreadSynchronize();
				cuCtxPopCurrent((CUcontext*) fCudaContext);
				printf("INTERNAL ERROR\n");
				return(1);
			}
			if (fDebugMask & 128) fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);
		}
	}

	int runSlices = 0;
	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)
	{
		if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;
		if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);
		AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<selectorBlockCount, HLTCA_GPU_THREAD_COUNT_SELECTOR, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));
		if (GPUSync("Tracklet Selector", iSlice, iSlice + firstSlice) RANDOM_ERROR)
		{
			cudaThreadSynchronize();
			cuCtxPopCurrent((CUcontext*) fCudaContext);
			return(1);
		}
	}
	StandalonePerfTime(firstSlice, 9);

	char *tmpMemoryGlobalTracking = NULL;
	fSliceOutputReady = 0;

	if (Reconstruct_Base_StartGlobal(pOutput, tmpMemoryGlobalTracking)) return(1);

	int tmpSlice = 0, tmpSlice2 = 0;
	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		if (fDebugLevel >= 3) HLTInfo("Transferring Tracks from GPU to Host");

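		//Overlap device-to-host transfers with the remaining GPU work: as soon as a slice's stream is idle, queue the copy of its common memory, then of its tracks and track hits.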
		while (tmpSlice < sliceCountLocal && (tmpSlice == iSlice || cudaStreamQuery(cudaStreams[tmpSlice]) == (cudaError_t) CUDA_SUCCESS))
		{
			if (GPUFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice])) RANDOM_ERROR)
			{
				ResetHelperThreads(1);
				ActivateThreadContext();
				return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal));
			}
			tmpSlice++;
		}

		while (tmpSlice2 < tmpSlice && (tmpSlice2 == iSlice ? cudaStreamSynchronize(cudaStreams[tmpSlice2]) : cudaStreamQuery(cudaStreams[tmpSlice2])) == (cudaError_t) CUDA_SUCCESS)
		{
			if (*fSlaveTrackers[firstSlice + tmpSlice2].NTracks() > 0)
			{
				GPUFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].Tracks(), fGpuTracker[tmpSlice2].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + tmpSlice2].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));
				GPUFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].TrackHits(), fGpuTracker[tmpSlice2].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + tmpSlice2].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));
			}
			tmpSlice2++;
		}

		if (GPUFailedMsg(cudaStreamSynchronize(cudaStreams[iSlice])) RANDOM_ERROR)
		{
			ResetHelperThreads(1);
			ActivateThreadContext();
			return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal));
		}

		if (fDebugLevel >= 4)
		{
			GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
#ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
			if (fDebugMask & 256) fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
#endif
			if (fDebugMask & 512) fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);
		}

		if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError RANDOM_ERROR)
		{
#ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
			if ((fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION || fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_WRONG_ROW) && nHardCollisions++ < 10)
			{
				if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION)
				{
					HLTWarning("Hard scheduling collision occurred, rerunning Tracklet Constructor (Slice %d)", firstSlice + iSlice);
				}
				else
				{
					HLTWarning("Tracklet Constructor returned invalid row (Slice %d)", firstSlice + iSlice);
				}
				if (fDebugLevel >= 4)
				{
					ResetHelperThreads(1);
					return(1);
				}
				for (int i = 0;i < sliceCountLocal;i++)
				{
					cudaThreadSynchronize();
					GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));
					*fSlaveTrackers[firstSlice + i].NTracks() = 0;
					*fSlaveTrackers[firstSlice + i].NTrackHits() = 0;
					fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;
					GPUFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));
					PreInitRowBlocks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());
				}
				goto RestartTrackletConstructor;
			}
#endif
			HLTError("GPU Tracker returned Error Code %d in slice %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError, firstSlice + iSlice);
			ResetHelperThreads(1);
			return(1);
		}
		if (fDebugLevel >= 3) HLTInfo("Tracks Transferred: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());

		if (Reconstruct_Base_FinishSlices(pOutput, iSlice, firstSlice)) return(1);

		if (fDebugLevel >= 4)
		{
			delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();
			delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();
		}
	}

	if (Reconstruct_Base_Finalize(pOutput, tmpMemoryGlobalTracking, firstSlice)) return(1);

	/*for (int i = firstSlice;i < firstSlice + sliceCountLocal;i++)
	{
		fSlaveTrackers[i].DumpOutput(stdout);
	}*/

	/*static int runnum = 0;
	std::ofstream tmpOut;
	char buffer[1024];
	sprintf(buffer, "GPUtracks%d.out", runnum++);
	tmpOut.open(buffer);
	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(tmpOut);
	}
	tmpOut.close();*/

#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
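	//Dump the per-thread stage-at-sync profile as a text table and as a BMP image: one pixel per thread, one row per sync point, with a white separator column after every 32 threads (warp boundary).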
	char* stageAtSync = (char*) malloc(100000000);
	GPUFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].StageAtSync(), 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
	cudaFree(fGpuTracker[0].StageAtSync());

	FILE* fp = fopen("profile.txt", "w+");
	FILE* fp2 = fopen("profile.bmp", "w+b");
	int nEmptySync = 0, fEmpty;

	const int bmpheight = 8192;
	BITMAPFILEHEADER bmpFH;
	BITMAPINFOHEADER bmpIH;
	ZeroMemory(&bmpFH, sizeof(bmpFH));
	ZeroMemory(&bmpIH, sizeof(bmpIH));

	bmpFH.bfType = 19778; //"BM"
	bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR / 32 * 33 - 1) * bmpheight;
	bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);

	bmpIH.biSize = sizeof(bmpIH);
	bmpIH.biWidth = fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR / 32 * 33 - 1;
	bmpIH.biHeight = bmpheight;
	bmpIH.biPlanes = 1;
	bmpIH.biBitCount = 32;

	fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
	fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);

	for (int i = 0;i < bmpheight * fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;i += fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR)
	{
		fEmpty = 1;
		for (int j = 0;j < fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;j++)
		{
			fprintf(fp, "%d\t", stageAtSync[i + j]);
			int color = 0;
			if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);
			if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);
			if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);
			if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);
			fwrite(&color, 1, sizeof(int), fp2);
			if (j > 0 && j % 32 == 0)
			{
				color = RGB(255, 255, 255);
				fwrite(&color, 1, 4, fp2);
			}
			if (stageAtSync[i + j]) fEmpty = 0;
		}
		fprintf(fp, "\n");
		if (fEmpty) nEmptySync++;
		else nEmptySync = 0;
		//if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;
	}

	fclose(fp);
	fclose(fp2);
	free(stageAtSync);
#endif

	cuCtxPopCurrent((CUcontext*) fCudaContext);
	return(0);
}

__global__ void ClearPPHitWeights(int sliceCount)
{
	//Clear HitWeights

	for (int k = 0;k < sliceCount;k++)
	{
		AliHLTTPCCATracker &tracker = ((AliHLTTPCCATracker*) gAliHLTTPCCATracker)[k];
		int4* const pHitWeights = (int4*) tracker.Data().HitWeights();
		const int dwCount = tracker.Data().NumberOfHitsPlusAlign();
		const int stride = get_global_size(0);
		int4 i0;
		i0.x = i0.y = i0.z = i0.w = 0;

		for (int i = get_global_id(0);i < dwCount * sizeof(int) / sizeof(int4);i += stride)
		{
			pHitWeights[i] = i0;
		}
	}
}

int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
{
	//Primary reconstruction function for small events (PP)

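	//Simplified path compared to Reconstruct(): each kernel is launched once for all slices, without per-slice streams, pipelined transfers, or the row-block scheduler.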
	memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);

	if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");

	char* tmpSliceMemHost = (char*) SliceDataMemory(fHostLockedMemory, 0);
	char* tmpSliceMemGpu = (char*) SliceDataMemory(fGPUMemory, 0);

	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		StandalonePerfTime(firstSlice + iSlice, 0);

		//Initialize GPU Slave Tracker
		if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
		fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(tmpSliceMemHost, RowMemory(fHostLockedMemory, firstSlice + iSlice));
		fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
		if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
		{
			HLTError("Insufficient Slice Data Memory");
			return(1);
		}

		//Make this a GPU Tracker
		fGpuTracker[iSlice].SetGPUTracker();
		fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));

		fGpuTracker[iSlice].SetGPUSliceDataMemory(tmpSliceMemGpu, RowMemory(fGPUMemory, iSlice));
		fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);

		tmpSliceMemHost += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();
		tmpSliceMemHost = alignPointer(tmpSliceMemHost, 64 * 1024);
		tmpSliceMemGpu += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();
		tmpSliceMemGpu = alignPointer(tmpSliceMemGpu, 64 * 1024);

		//Set Pointers to GPU Memory
		char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);

		if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
		tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
		tmpMem = alignPointer(tmpMem, 64 * 1024);

		if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
		tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount);
		tmpMem = alignPointer(tmpMem, 64 * 1024);

		if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
		tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
		tmpMem = alignPointer(tmpMem, 64 * 1024);

		if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
		{
			HLTError("Insufficient Track Memory");
			return(1);
		}

		if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
		{
			HLTError("Insufficient Global Memory");
			return(1);
		}

		//Initialize Startup Constants
		*fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
		*fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
		*fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
		fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;

		fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());

		if (GPUSync("Initialization", -1, iSlice + firstSlice)) return(1);
		StandalonePerfTime(firstSlice + iSlice, 1);
	}

#ifdef HLTCA_GPU_TEXTURE_FETCH
	cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
	size_t offset;
	if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
	{
		HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
		return(1);
	}
	cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
	if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
	{
		HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
		return(1);
	}
	cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
	if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
	{
		HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
		return(1);
	}
#endif

	//Copy Tracker Object to GPU Memory
	if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
	GPUFailedMsg(cudaMemcpyToSymbol(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice));

	//Copy Data to GPU Global Memory
	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		GPUFailedMsg(cudaMemcpy(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice));
		//printf("%lld %lld %d %d\n", (size_t) (char*) fGpuTracker[iSlice].Data().Memory(), (size_t) (char*) fSlaveTrackers[firstSlice + iSlice].Data().Memory(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize());
	}
	//GPUFailedMsg(cudaMemcpy(SliceDataMemory(fGPUMemory, 0), SliceDataMemory(fHostLockedMemory, 0), tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0), cudaMemcpyHostToDevice));
	//printf("%lld %lld %d\n", (size_t) (char*) SliceDataMemory(fGPUMemory, 0), (size_t) (char*) SliceDataMemory(fHostLockedMemory, 0), (int) (size_t) (tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0)));
	GPUFailedMsg(cudaMemcpy(fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyHostToDevice));
	GPUFailedMsg(cudaMemcpy(fGpuTracker[0].SliceDataRows(), fSlaveTrackers[firstSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) * sliceCountLocal, cudaMemcpyHostToDevice));

	if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");
	AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursFinder> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_FINDER>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows());
	if (GPUSync("Neighbours finder", -1, firstSlice)) return 1;
	StandalonePerfTime(firstSlice, 2);
	if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");
	AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursCleaner> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 2);
	if (GPUSync("Neighbours Cleaner", -1, firstSlice)) return 1;
	StandalonePerfTime(firstSlice, 3);
	if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");
	AliHLTTPCCAProcessMultiA<AliHLTTPCCAStartHitsFinder> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 6);
	if (GPUSync("Start Hits Finder", -1, firstSlice)) return 1;
	StandalonePerfTime(firstSlice, 4);

	ClearPPHitWeights <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(sliceCountLocal);
	if (GPUSync("Clear Hit Weights", -1, firstSlice)) return 1;

	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
	}

	StandalonePerfTime(firstSlice, 7);

	if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
	AliHLTTPCCATrackletConstructorGPUPP<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR>>>(0, sliceCountLocal);
	if (GPUSync("Tracklet Constructor PP", -1, firstSlice)) return 1;

	StandalonePerfTime(firstSlice, 8);

	AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<selectorBlockCount, HLTCA_GPU_THREAD_COUNT_SELECTOR>>>(0, sliceCountLocal);
	if (GPUSync("Tracklet Selector", -1, firstSlice)) return 1;
	StandalonePerfTime(firstSlice, 9);

	GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice].CommonMemory(), fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyDeviceToHost));

	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		if (fDebugLevel >= 3) HLTInfo("Transferring Tracks from GPU to Host");

		GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost));
		GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost));

		if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
		{
			HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
			return(1);
		}
		if (fDebugLevel >= 3) HLTInfo("Tracks Transferred: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
		fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTracks = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracks;
		fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTrackHits = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTrackHits;
	}

	if (fGlobalTracking && sliceCountLocal == fgkNSlices)
	{
		char tmpMemory[sizeof(AliHLTTPCCATracklet)
#ifdef EXTERN_ROW_HITS
			+ HLTCA_ROW_COUNT * sizeof(int)
#endif
			+ 16];

		for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
		{
			if (fSlaveTrackers[iSlice].CommonMemory()->fNTracklets)
			{
				HLTError("Slave tracker tracklets found where none expected, memory not freed!\n");
			}
			fSlaveTrackers[iSlice].SetGPUTrackerTrackletsMemory(&tmpMemory[0], 1, fConstructorBlockCount);
			fSlaveTrackers[iSlice].CommonMemory()->fNTracklets = 1;
		}

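		//The left/right neighbours are taken within the same half of the detector: the first fgkNSlices / 2 slices form one side, the remaining slices the other.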
		for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
		{
			int sliceLeft = (iSlice + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2);
			int sliceRight = (iSlice + 1) % (fgkNSlices / 2);
			if (iSlice >= fgkNSlices / 2)
			{
				sliceLeft += fgkNSlices / 2;
				sliceRight += fgkNSlices / 2;
			}
			fSlaveTrackers[iSlice].PerformGlobalTracking(fSlaveTrackers[sliceLeft], fSlaveTrackers[sliceRight], HLTCA_GPU_MAX_TRACKS);
		}

		for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
		{
			printf("Slice %d - Tracks: Local %d Global %d - Hits: Local %d Global %d\n", iSlice, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTracks, fSlaveTrackers[iSlice].CommonMemory()->fNTracks, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTrackHits, fSlaveTrackers[iSlice].CommonMemory()->fNTrackHits);
		}
	}

	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
		fSlaveTrackers[firstSlice + iSlice].WriteOutputPrepare();
		fSlaveTrackers[firstSlice + iSlice].WriteOutput();
	}

	StandalonePerfTime(firstSlice, 10);

	if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");

	return(0);
}

int AliHLTTPCCAGPUTrackerNVCC::ExitGPU_Runtime()
{
	//Uninitialize CUDA
	cuCtxPushCurrent(*((CUcontext*) fCudaContext));

	cudaThreadSynchronize();
	if (fGPUMemory)
	{
		cudaFree(fGPUMemory);
		fGPUMemory = NULL;
	}
	if (fHostLockedMemory)
	{
		for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
		{
			cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
		}
		free(fpCudaStreams);
		fGpuTracker = NULL;
		cudaFreeHost(fHostLockedMemory);
	}

	if (GPUFailedMsg(cudaThreadExit()))
	{
		HLTError("Could not uninitialize GPU");
		return(1);
	}

	cuCtxDestroy(*((CUcontext*) fCudaContext));

	cudaDeviceReset();

	HLTInfo("CUDA Uninitialized");
	fCudaInitialized = 0;
	return(0);
}

int AliHLTTPCCAGPUTrackerNVCC::RefitMergedTracks(AliHLTTPCGMMerger* Merger)
{
#ifndef HLTCA_GPU_MERGER
	HLTError("HLTCA_GPU_MERGER compile flag not set");
	return(1);
#else
	if (!fCudaInitialized)
	{
		HLTError("GPU Merger not initialized");
		return(1);
	}
	unsigned long long int a, b, c, d, e;
	AliHLTTPCCATracker::StandaloneQueryFreq(&e);
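	//a..d take timestamps around upload, refit kernel, and download; e holds the timer frequency used to convert the differences to microseconds below.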

	char* gpumem = (char*) fGPUMergerMemory;
	float *X, *Y, *Z, *Angle;
	unsigned int *RowType;
	AliHLTTPCGMMergedTrack* tracks;
	float* field;
	AliHLTTPCCAParam* param;

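	//Sub-allocate the refit buffers from the pre-allocated merger scratch memory; the overflow check below guards against exceeding fGPUMergerMaxMemory.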
	gpumem = alignPointer(gpumem, 1024 * 1024);

	AssignMemory(X, gpumem, Merger->NClusters());
	AssignMemory(Y, gpumem, Merger->NClusters());
	AssignMemory(Z, gpumem, Merger->NClusters());
	AssignMemory(Angle, gpumem, Merger->NClusters());
	AssignMemory(RowType, gpumem, Merger->NClusters());
	AssignMemory(tracks, gpumem, Merger->NOutputTracks());
	AssignMemory(field, gpumem, 6);
	AssignMemory(param, gpumem, 1);

	if ((size_t) (gpumem - (char*) fGPUMergerMemory) > (size_t) fGPUMergerMaxMemory)
	{
		HLTError("Insufficient GPU Merger Memory");
	}

	cuCtxPushCurrent(*((CUcontext*) fCudaContext));

	if (fDebugLevel >= 2) HLTInfo("Running GPU Merger (%d/%d)", Merger->NOutputTrackClusters(), Merger->NClusters());
	AliHLTTPCCATracker::StandaloneQueryTime(&a);
	GPUFailedMsg(cudaMemcpy(X, Merger->ClusterX(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
	GPUFailedMsg(cudaMemcpy(Y, Merger->ClusterY(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
	GPUFailedMsg(cudaMemcpy(Z, Merger->ClusterZ(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
	GPUFailedMsg(cudaMemcpy(Angle, Merger->ClusterAngle(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
	GPUFailedMsg(cudaMemcpy(RowType, Merger->ClusterRowType(), Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyHostToDevice));
	GPUFailedMsg(cudaMemcpy(tracks, Merger->OutputTracks(), Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyHostToDevice));
	GPUFailedMsg(cudaMemcpy(field, Merger->PolinomialFieldBz(), 6 * sizeof(float), cudaMemcpyHostToDevice));
	GPUFailedMsg(cudaMemcpy(param, fSlaveTrackers[0].pParam(), sizeof(AliHLTTPCCAParam), cudaMemcpyHostToDevice));
	AliHLTTPCCATracker::StandaloneQueryTime(&b);
	RefitTracks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(tracks, Merger->NOutputTracks(), field, X, Y, Z, RowType, Angle, param);
	GPUFailedMsg(cudaThreadSynchronize());
	AliHLTTPCCATracker::StandaloneQueryTime(&c);
	GPUFailedMsg(cudaMemcpy(Merger->ClusterX(), X, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
	GPUFailedMsg(cudaMemcpy(Merger->ClusterY(), Y, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
	GPUFailedMsg(cudaMemcpy(Merger->ClusterZ(), Z, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
	GPUFailedMsg(cudaMemcpy(Merger->ClusterAngle(), Angle, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
	GPUFailedMsg(cudaMemcpy(Merger->ClusterRowType(), RowType, Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyDeviceToHost));
	GPUFailedMsg(cudaMemcpy((void*) Merger->OutputTracks(), tracks, Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyDeviceToHost));
	GPUFailedMsg(cudaThreadSynchronize());
	AliHLTTPCCATracker::StandaloneQueryTime(&d);
	if (fDebugLevel >= 2) HLTInfo("GPU Merger Finished");

	if (fDebugLevel > 0)
	{
		int copysize = 4 * Merger->NOutputTrackClusters() * sizeof(float) + Merger->NOutputTrackClusters() * sizeof(unsigned int) + Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack) + 6 * sizeof(float) + sizeof(AliHLTTPCCAParam);
		double speed = (double) copysize * (double) e / (double) (b - a) / 1e9;
		printf("GPU Fit:\tCopy To:\t%lld us (%lf GB/s)\n", (b - a) * 1000000 / e, speed);
		printf("\t\tFit:\t%lld us\n", (c - b) * 1000000 / e);
		speed = (double) copysize * (double) e / (double) (d - c) / 1e9;
		printf("\t\tCopy From:\t%lld us (%lf GB/s)\n", (d - c) * 1000000 / e, speed);
	}

	cuCtxPopCurrent((CUcontext*) fCudaContext);
	return(0);
#endif
}

int AliHLTTPCCAGPUTrackerNVCC::GPUMergerAvailable()
{
	return(1);
}

void AliHLTTPCCAGPUTrackerNVCC::ActivateThreadContext()
{
	cuCtxPushCurrent(*((CUcontext*) fCudaContext));
}

void AliHLTTPCCAGPUTrackerNVCC::ReleaseThreadContext()
{
	cuCtxPopCurrent((CUcontext*) fCudaContext);
}

void AliHLTTPCCAGPUTrackerNVCC::SynchronizeGPU()
{
	cudaThreadSynchronize();
}

AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()
{
	return new AliHLTTPCCAGPUTrackerNVCC;
}

void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr)
{
	delete ptr;
}