// HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cu
// **************************************************************************
// This file is property of and copyright by the ALICE HLT Project          *
// ALICE Experiment at CERN, All rights reserved.                           *
//                                                                          *
// Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
//                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *
//                  David Rohr <drohr@kip.uni-heidelberg.de>                *
//                  for The ALICE HLT Project.                              *
//                                                                          *
// Permission to use, copy, modify and distribute this software and its    *
// documentation strictly for non-commercial purposes is hereby granted    *
// without fee, provided that the above copyright notice appears in all    *
// copies and that both the copyright notice and this permission notice    *
// appear in the supporting documentation. The authors make no claims      *
// about the suitability of this software for any purpose. It is           *
// provided "as is" without express or implied warranty.                   *
//                                                                         *
//***************************************************************************

#include "AliHLTTPCCAGPUTracker.h"

#ifdef BUILD_GPU

#include "AliHLTTPCCADef.h"
#include "AliHLTTPCCAGPUConfig.h"

#include <sm_11_atomic_functions.h>
#include <sm_12_atomic_functions.h>

#include <iostream>

//Disable assertions since they produce errors in GPU code
#ifdef assert
#undef assert
#endif
#define assert(param)

__constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];
#ifdef HLTCA_GPU_TEXTURE_FETCH
texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;
texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;
texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
#endif

//Include CXX files; the GPUd() macro will then produce CUDA device code out of the tracker source code
#include "AliHLTTPCCATrackParam.cxx"
#include "AliHLTTPCCATrack.cxx"

#include "AliHLTTPCCAHitArea.cxx"
#include "AliHLTTPCCAGrid.cxx"
#include "AliHLTTPCCARow.cxx"
#include "AliHLTTPCCAParam.cxx"
#include "AliHLTTPCCATracker.cxx"

#include "AliHLTTPCCAProcess.h"

#include "AliHLTTPCCATrackletSelector.cxx"
#include "AliHLTTPCCANeighboursFinder.cxx"
#include "AliHLTTPCCANeighboursCleaner.cxx"
#include "AliHLTTPCCAStartHitsFinder.cxx"
#include "AliHLTTPCCAStartHitsSorter.cxx"
#include "AliHLTTPCCATrackletConstructor.cxx"
#include "AliHLTTPCCASliceOutput.cxx"

#include "MemoryAssignmentHelpers.h"

#ifndef HLTCA_STANDALONE
#include "AliHLTDefinitions.h"
#include "AliHLTSystem.h"
#endif

ClassImp( AliHLTTPCCAGPUTracker )

bool AliHLTTPCCAGPUTracker::fgGPUUsed = false;
int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
{
	//Find the best CUDA device, initialize it, and allocate memory

	if (fgGPUUsed)
	{
		HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in the same process");
		return(1);
	}

	cudaDeviceProp fCudaDeviceProp;

#ifndef CUDA_DEVICE_EMULATION
	int count, bestDevice = -1, bestDeviceSpeed = 0;
	if (CudaFailedMsg(cudaGetDeviceCount(&count)))
	{
		HLTError("Error getting CUDA Device Count");
		return(1);
	}
	if (fDebugLevel >= 2) std::cout << "Available CUDA devices: ";
	for (int i = 0;i < count;i++)
	{
		cudaGetDeviceProperties(&fCudaDeviceProp, i);
		if (fDebugLevel >= 2) std::cout << fCudaDeviceProp.name << " (" << i << ") ";
		if (fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && fCudaDeviceProp.multiProcessorCount * fCudaDeviceProp.clockRate > bestDeviceSpeed)
		{
			bestDevice = i;
			bestDeviceSpeed = fCudaDeviceProp.multiProcessorCount * fCudaDeviceProp.clockRate;
		}
	}
	if (fDebugLevel >= 2) std::cout << std::endl;

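	//The selection above accepts compute capability >= 1.2 and < 9 (the emulation
	//target reports major == 9999) and scores each device as
	//multiProcessorCount * clockRate. Worked example (hypothetical numbers, not
	//from this file): a device with 30 multiprocessors at clockRate 1296000 kHz
	//scores 30 * 1296000 = 38880000 and beats one with 4 multiprocessors at
	//1188000 kHz, which scores 4 * 1188000 = 4752000.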
	if (bestDevice == -1)
	{
		HLTWarning("No CUDA Device available, aborting CUDA Initialisation");
		return(1);
	}

	int cudaDevice;
	if (forceDeviceID == -1)
		cudaDevice = bestDevice;
	else
		cudaDevice = forceDeviceID;
#else
	int cudaDevice = 0;
#endif

	cudaGetDeviceProperties(&fCudaDeviceProp, cudaDevice);

	if (fDebugLevel >= 1)
	{
		std::cout << "CUDA Device Properties: " << std::endl;
		std::cout << "name = " << fCudaDeviceProp.name << std::endl;
		std::cout << "totalGlobalMem = " << fCudaDeviceProp.totalGlobalMem << std::endl;
		std::cout << "sharedMemPerBlock = " << fCudaDeviceProp.sharedMemPerBlock << std::endl;
		std::cout << "regsPerBlock = " << fCudaDeviceProp.regsPerBlock << std::endl;
		std::cout << "warpSize = " << fCudaDeviceProp.warpSize << std::endl;
		std::cout << "memPitch = " << fCudaDeviceProp.memPitch << std::endl;
		std::cout << "maxThreadsPerBlock = " << fCudaDeviceProp.maxThreadsPerBlock << std::endl;
		std::cout << "maxThreadsDim = " << fCudaDeviceProp.maxThreadsDim[0] << " " << fCudaDeviceProp.maxThreadsDim[1] << " " << fCudaDeviceProp.maxThreadsDim[2] << std::endl;
		std::cout << "maxGridSize = " << fCudaDeviceProp.maxGridSize[0] << " " << fCudaDeviceProp.maxGridSize[1] << " " << fCudaDeviceProp.maxGridSize[2] << std::endl;
		std::cout << "totalConstMem = " << fCudaDeviceProp.totalConstMem << std::endl;
		std::cout << "major = " << fCudaDeviceProp.major << std::endl;
		std::cout << "minor = " << fCudaDeviceProp.minor << std::endl;
		std::cout << "clockRate = " << fCudaDeviceProp.clockRate << std::endl;
		std::cout << "textureAlignment = " << fCudaDeviceProp.textureAlignment << std::endl;
	}

	if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
	{
		HLTError("Unsupported CUDA Device");
		return(1);
	}

	if (CudaFailedMsg(cudaSetDevice(cudaDevice)))
	{
		HLTError("Could not set CUDA Device!");
		return(1);
	}

	if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
	{
		HLTError("Insufficient Common Memory");
		cudaThreadExit();
		return(1);
	}

	if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
	{
		HLTError("Insufficient Row Memory");
		cudaThreadExit();
		return(1);
	}

	fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
	if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
	{
		HLTError("CUDA Memory Allocation Error");
		cudaThreadExit();
		return(1);
	}
	if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
	int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
	if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
	{
		cudaFree(fGPUMemory);
		cudaThreadExit();
		HLTError("Error allocating Page Locked Host Memory");
		return(1);
	}
	if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);

	if (fDebugLevel >= 1)
	{
		CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
	}
	HLTInfo("CUDA Initialisation successful");

	//Don't run the constructor / destructor here, this will just be a local memcopy of the Tracker into GPU memory
	if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
	{
		HLTError("Insufficient Tracker Object Memory");
		return(1);
	}
	fSliceCount = sliceCount;
	fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);

	for (int i = 0;i < fgkNSlices;i++)
	{
		fSlaveTrackers[i].SetGPUTracker();
		fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
		fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
	}

	fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
	cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
	for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
	{
		if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
		{
			HLTError("Error creating CUDA Stream");
			return(1);
		}
	}
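
	//Note on the CAMath::Max(3, fSliceCount) above: Reconstruct() below uses
	//stream 0 for the constant-memory upload, stream 2 for PreInitRowBlocks, and
	//streams (iSlice & 1) for the per-slice host/device copies, so indices 0..2
	//must exist even when fewer than three slices are processed.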

#if defined(HLTCA_STANDALONE) && !defined(CUDA_DEVICE_EMULATION)
	if (fDebugLevel < 2)
	{
		//Do one initial warm-up run for benchmarking purposes
		const int useDebugLevel = fDebugLevel;
		fDebugLevel = 0;
		AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];

		std::ifstream fin;

		AliHLTTPCCAParam tmpParam;
		AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;

		fin.open("events/settings.dump");
		int tmpCount;
		fin >> tmpCount;
		for (int i = 0;i < sliceCount;i++)
		{
			fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);
			tmpParam.ReadSettings(fin);
			InitializeSliceParam(i, tmpParam);
		}
		fin.close();

		fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);
		for (int i = 0;i < sliceCount;i++)
		{
			tmpCluster[i].StartReading(i, 0);
			tmpCluster[i].ReadEvent(fin);
			tmpCluster[i].FinishReading();
		}
		fin.close();

		AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];
		memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));

		Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);
		for (int i = 0;i < sliceCount;i++)
		{
			free(tmpOutput[i]);
			tmpOutput[i] = NULL;
			fSlaveTrackers[i].SetOutputControl(NULL);
		}
		delete[] tmpOutput;
		delete[] tmpCluster;
		fDebugLevel = useDebugLevel;
	}
#endif
	fgGPUUsed = true;
	return(0);
}

template <class T> inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int alignment)
{
	//Helper function to align pointers.
	//Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker
	//(as long as every single data structure is <= 1 MB)

	size_t adr = (size_t) ptr;
	if (adr % alignment)
	{
		adr += alignment - (adr % alignment);
	}
	return((T*) adr);
}
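
//Worked example for the alignment arithmetic above (illustrative only, not part
//of the original source): with alignment = 1024 * 1024 (0x100000), a pointer at
//address 0x1000001 has 0x1000001 % 0x100000 == 1, so it is advanced by
//0x100000 - 1 bytes and the function returns 0x1100000, the next 1 MB boundary.
//A pointer already on a 1 MB boundary is returned unchanged.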

bool AliHLTTPCCAGPUTracker::CudaFailedMsg(cudaError_t error)
{
	//Check for CUDA Error and in the case of an error display the corresponding error string
	if (error == cudaSuccess) return(false);
	HLTWarning("CUDA Error: %d / %s", error, cudaGetErrorString(error));
	return(true);
}
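
//Typical usage, as a sketch mirroring how the rest of this file calls it:
//	if (CudaFailedMsg(cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice))) return(1);
//The helper returns true on failure, so error handling stays a one-liner at each
//call site while the error string is still logged.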

int AliHLTTPCCAGPUTracker::CUDASync(char* state)
{
	//Wait for the CUDA kernel to finish and check for CUDA errors afterwards

	if (fDebugLevel == 0) return(0);
	cudaError cuErr;
	cuErr = cudaGetLastError();
	if (cuErr != cudaSuccess)
	{
		HLTError("Cuda Error %s while invoking kernel (%s)", cudaGetErrorString(cuErr), state);
		return(1);
	}
	if (CudaFailedMsg(cudaThreadSynchronize()))
	{
		HLTError("CUDA Error while synchronizing (%s)", state);
		return(1);
	}
	if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
	return(0);
}

void AliHLTTPCCAGPUTracker::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
{
	//Set the debug level and the debug output file if applicable
	fDebugLevel = dwLevel;
	if (NewOutFile) fOutFile = NewOutFile;
}

int AliHLTTPCCAGPUTracker::SetGPUTrackerOption(char* OptionName, int /*OptionValue*/)
{
	//Set a specific GPU tracker option
	{
		HLTError("Unknown Option: %s", OptionName);
		return(1);
	}
	//No options are used at the moment
	//return(0);
}

#ifdef HLTCA_STANDALONE
void AliHLTTPCCAGPUTracker::StandalonePerfTime(int iSlice, int i)
{
	//Run a performance query for timer i of slice iSlice
	if (fDebugLevel >= 1)
	{
		AliHLTTPCCAStandaloneFramework::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));
	}
}
#else
void AliHLTTPCCAGPUTracker::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
#endif

void AliHLTTPCCAGPUTracker::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
{
	//Dump row blocks to file
	if (fDebugLevel >= 4)
	{
		*fOutFile << "RowBlock Tracklets" << std::endl;

		int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);
		int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);
		uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT);
		CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
		CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
		CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT, cudaMemcpyDeviceToHost));
		CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));

		int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
		for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
		{
			*fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " <<
				rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";
			for (int j = 0;j < rowBlockPos[i].x;j++)
			{
				//Use the Tracker object to calculate the offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU memory!
				*fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
				if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)
				{
					HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);
				}
				k++;
				if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)
				{
					HLTError("Error, -1 Tracklet found");
				}
			}
			*fOutFile << std::endl << "Phase 2: ";
			for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
			{
				*fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
			}
			*fOutFile << std::endl;
		}

		if (check)
		{
			*fOutFile << "Starting Threads: (First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;
			for (int i = 0;i < HLTCA_GPU_BLOCK_COUNT;i++)
			{
				*fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;
			}
		}

		free(rowBlockPos);
		free(rowBlockTracklets);
		free(blockStartingTracklet);
	}
}

__global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)
{
	//Initialize the GPU RowBlocks and HitWeights
	int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;
	int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;
	const int stride = blockDim.x * gridDim.x;
	int4 i0, i1;
	i0.x = i0.y = i0.z = i0.w = 0;
	i1.x = i1.y = i1.z = i1.w = -1;
	for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)
		RowBlockPos[i] = i0;
	for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)
		rowBlockTracklets4[i] = i1;
	for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)
		sliceDataHitWeights4[i] = i0;
}
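
//The loops above are plain grid-stride loops: each thread starts at its global
//index and advances by the total thread count, so any <<<blocks, threads>>>
//launch configuration covers the whole array, and writing through int4 moves
//16 bytes per iteration. A minimal standalone sketch of the same pattern
//(illustrative only, not part of the tracker):
//	__global__ void FillInt4(int4* dst, int n, int4 value)
//	{
//		const int stride = blockDim.x * gridDim.x;
//		for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < n;i += stride)
//			dst[i] = value;
//	}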

int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
{
	//Primary reconstruction function
	cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;

	if (sliceCountLocal == -1) sliceCountLocal = this->fSliceCount;

	if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
	{
		HLTError("Insufficient constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));
		return(1);
	}

	if (fDebugLevel >= 4)
	{
		for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
		{
			*fOutFile << std::endl << std::endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << std::endl;
		}
	}

	memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);

	if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice + sliceCountLocal - 1].Param().ISlice());
	if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");

#ifdef HLTCA_GPU_TIME_PROFILE
	unsigned __int64 a, b, c, d;
	AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);
	AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);
#endif

	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		//Make this a GPU tracker
		fGpuTracker[iSlice].SetGPUTracker();
		fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
		fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
		fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);

		//Set pointers to GPU memory
		char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);

		if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
		tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
		tmpMem = alignPointer(tmpMem, 1024 * 1024);

		if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
		tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);
		tmpMem = alignPointer(tmpMem, 1024 * 1024);

		if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
		tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());
		tmpMem = alignPointer(tmpMem, 1024 * 1024);

		if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
		{
			HLTError("Insufficient Track Memory");
			return(1);
		}

		if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
		{
			HLTError("Insufficient Global Memory");
			return(1);
		}

		//Initialize startup constants
		*fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
		*fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
		*fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
		fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = HLTCA_GPU_BLOCK_COUNT * (iSlice + 1) / sliceCountLocal - HLTCA_GPU_BLOCK_COUNT * (iSlice) / sliceCountLocal;
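		//The expression above partitions HLTCA_GPU_BLOCK_COUNT blocks over the slices
		//as a difference of rounded prefixes, so the per-slice counts always sum to
		//the total. Worked example (numbers illustrative only): 30 blocks over 4
		//slices gives 30*1/4 - 30*0/4 = 7, 30*2/4 - 30*1/4 = 8, 30*3/4 - 30*2/4 = 7,
		//30*4/4 - 30*3/4 = 8, i.e. 7 + 8 + 7 + 8 = 30.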
		if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);
		fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
		fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
		fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
		fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
	}

#ifdef HLTCA_GPU_TEXTURE_FETCH
	cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
	size_t offset;
	if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
	{
		HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
		return(1);
	}
	cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
	if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
	{
		HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
		return(1);
	}
	cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
	if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
	{
		HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
		return(1);
	}
#endif
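
//All three texture references are bound to the same base address,
//fGpuTracker[0].Data().Memory(), which matches the SetGPUTextureBase() call in
//the loop above, so device code can read slice data through the texture cache
//by element index. A sketch of the matching read side (an assumption for
//illustration; the actual fetches live in the included device code, not here):
//	#ifdef HLTCA_GPU_TEXTURE_FETCH
//		ushort2 hit = tex1Dfetch(gAliTexRefu2, elementIndex);
//	#endif
//The "|| offset" check enforces that the binding starts exactly at the texture
//base, since fetch indices are computed relative to that base.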

	//Copy the Tracker object to GPU memory
	if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
	if (CudaFailedMsg(cudaMalloc(&fGpuTracker[0].fStageAtSync, 100000000))) return(1);
	CudaFailedMsg(cudaMemset(fGpuTracker[0].fStageAtSync, 0, 100000000));
#endif
	CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));

	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		StandalonePerfTime(firstSlice + iSlice, 0);

		//Initialize the GPU slave tracker
		if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
		fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
#ifdef HLTCA_GPU_TIME_PROFILE
		AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
#endif
		fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
#ifdef HLTCA_GPU_TIME_PROFILE
		AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
		printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
#endif
		if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
		{
			HLTError("Insufficient Slice Data Memory");
			return(1);
		}

		/*if (fSlaveTrackers[firstSlice + iSlice].CheckEmptySlice())
		{
			if (fDebugLevel >= 3) HLTInfo("Slice Empty, not running GPU Tracker");
			if (sliceCountLocal == 1)
				return(0);
		}*/

		//Initialize temporary memory where needed
		if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");
		PreInitRowBlocks<<<30, 256, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());

		//Copy data to GPU global memory
		CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
		CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
		CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));

		if (fDebugLevel >= 4)
		{
			if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
			fSlaveTrackers[firstSlice + iSlice].TrackletMemory() = reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] );
			fSlaveTrackers[firstSlice + iSlice].SetPointersTracklets( HLTCA_GPU_MAX_TRACKLETS );
			fSlaveTrackers[firstSlice + iSlice].HitMemory() = reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100] );
			fSlaveTrackers[firstSlice + iSlice].SetPointersHits( pClusterData[iSlice].NumberOfClusters() );
		}

		if (CUDASync("Initialization")) return(1);
		StandalonePerfTime(firstSlice + iSlice, 1);

		if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");
		AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);

		if (CUDASync("Neighbours finder")) return 1;

		StandalonePerfTime(firstSlice + iSlice, 2);

		if (fDebugLevel >= 4)
		{
			CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
			fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
		}

		if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");
		AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
		if (CUDASync("Neighbours Cleaner")) return 1;

		StandalonePerfTime(firstSlice + iSlice, 3);

		if (fDebugLevel >= 4)
		{
			CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
			fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
		}

		if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");
		AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
		if (CUDASync("Start Hits Finder")) return 1;

		StandalonePerfTime(firstSlice + iSlice, 4);

		if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter");
		AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<30, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
		if (CUDASync("Start Hits Sorter")) return 1;

		StandalonePerfTime(firstSlice + iSlice, 5);

		if (fDebugLevel >= 2)
		{
			CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
			if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
			if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS)
			{
				HLTError("HLTCA_GPU_MAX_TRACKLETS constant insufficient");
				return(1);
			}
		}

		if (fDebugLevel >= 4)
		{
			*fOutFile << "Temporary ";
			CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
			fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
			uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
			CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
			*fOutFile << "Start Hits Sort Vector:" << std::endl;
			for (int i = 0;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows();i++)
			{
				*fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;
			}
			free(tmpMemory);

			CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
			fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
		}

		StandalonePerfTime(firstSlice + iSlice, 6);

		fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
	}

	StandalonePerfTime(firstSlice, 7);
#ifdef HLTCA_GPU_PREFETCHDATA
	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		if (fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v) > ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4))
		{
			HLTError("Insufficient GPU shared memory, required: %d, available %d", fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v), ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4));
			return(1);
		}
		if (fDebugLevel >= 1)
		{
			static int infoShown = 0;
			if (!infoShown)
			{
				HLTInfo("GPU Shared Memory Cache Size: %d", 2 * fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v));
				infoShown = 1;
			}
		}
	}
#endif

	if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);
		if (CUDASync("Tracklet Initializer")) return 1;
		DumpRowBlocks(fSlaveTrackers, iSlice);
	}

	if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
	AliHLTTPCCATrackletConstructorNewGPU<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>();
	if (CUDASync("Tracklet Constructor (new)")) return 1;

	StandalonePerfTime(firstSlice, 8);

	if (fDebugLevel >= 4)
	{
		for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
		{
			DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);
			CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
			if (fDebugLevel >= 5)
			{
				HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
			}
			CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
			CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
			fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);
		}
	}

	int runSlices = 0;
	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)
	{
		if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;
		if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);
		AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));
	}
	if (CUDASync("Tracklet Selector")) return 1;
	StandalonePerfTime(firstSlice, 9);
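
	//The loop below is a two-stage software pipeline: in iteration iSliceTmp the
	//track data of slice iSliceTmp is requested asynchronously, while the output
	//of slice iSliceTmp - 1, whose copies were issued one iteration earlier, is
	//written out on the host. That is why the loop runs to <= sliceCountLocal:
	//the extra final iteration only drains the last slice. Schematically:
	//	copyCommon(0);
	//	for (s = 0;s <= n;s++) { if (s < n) copyTracks(s, async); if (s > 0) { wait(s - 1); writeOutput(s - 1); } }
	//so device-to-host DMA for slice s overlaps with host-side WriteOutput for slice s - 1.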

	CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + 0].CommonMemory(), fGpuTracker[0].CommonMemory(), fGpuTracker[0].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[0]));
	for (int iSliceTmp = 0;iSliceTmp <= sliceCountLocal;iSliceTmp++)
	{
		if (iSliceTmp < sliceCountLocal)
		{
			int iSlice = iSliceTmp;
			if (fDebugLevel >= 3) HLTInfo("Transferring Tracks from GPU to Host");
			cudaStreamSynchronize(cudaStreams[iSlice]);
			CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
			CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
			if (iSlice + 1 < sliceCountLocal)
				CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[iSlice + 1]));
		}

		if (iSliceTmp)
		{
			int iSlice = iSliceTmp - 1;
			cudaStreamSynchronize(cudaStreams[iSlice]);

			if (fDebugLevel >= 4)
			{
				CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
				fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
				fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);
			}

			if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
			{
				HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
				return(1);
			}
			if (fDebugLevel >= 3) HLTInfo("Tracks Transferred: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());

			fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
#ifdef HLTCA_GPU_TIME_PROFILE
			AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
#endif
			fSlaveTrackers[firstSlice + iSlice].WriteOutput();
#ifdef HLTCA_GPU_TIME_PROFILE
			AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
			printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
#endif

			if (fDebugLevel >= 4)
			{
				delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();
				delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();
			}
		}
	}

	StandalonePerfTime(firstSlice, 10);

	if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");

#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
	char* stageAtSync = (char*) malloc(100000000);
	CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].fStageAtSync, 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
	cudaFree(fGpuTracker[0].fStageAtSync);

	FILE* fp = fopen("profile.txt", "w+");
	FILE* fp2 = fopen("profile.bmp", "w+b");
	int nEmptySync = 0, fEmpty;

	const int bmpheight = 1000;
	BITMAPFILEHEADER bmpFH;
	BITMAPINFOHEADER bmpIH;
	ZeroMemory(&bmpFH, sizeof(bmpFH));
	ZeroMemory(&bmpIH, sizeof(bmpIH));

	bmpFH.bfType = 19778; //"BM"
	bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1) * bmpheight;
	bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);

	bmpIH.biSize = sizeof(bmpIH);
	bmpIH.biWidth = HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1;
	bmpIH.biHeight = bmpheight;
	bmpIH.biPlanes = 1;
	bmpIH.biBitCount = 32;

	fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
	fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);

	for (int i = 0;i < bmpheight * HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;i += HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT)
	{
		fEmpty = 1;
		for (int j = 0;j < HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;j++)
		{
			fprintf(fp, "%d\t", stageAtSync[i + j]);
			int color = 0;
			if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);
			if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);
			if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);
			if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);
			fwrite(&color, 1, sizeof(int), fp2);
			if (j > 0 && j % 32 == 0)
			{
				color = RGB(255, 255, 255);
				fwrite(&color, 1, 4, fp2);
			}
			if (stageAtSync[i + j]) fEmpty = 0;
		}
		fprintf(fp, "\n");
		if (fEmpty) nEmptySync++;
		else nEmptySync = 0;
		//if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;
	}

	fclose(fp);
	fclose(fp2);
	free(stageAtSync);
#endif

	return(0);
}

int AliHLTTPCCAGPUTracker::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
{
	//Initialize the slice tracker parameters for a slave tracker
	fSlaveTrackers[iSlice].Initialize(param);
	if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
	{
		HLTError("Error, Slice Tracker %d Row Count of %d does not match the HLTCA_ROW_COUNT constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
		return(1);
	}
	return(0);
}

int AliHLTTPCCAGPUTracker::ExitGPU()
{
	//Uninitialize CUDA
	cudaThreadSynchronize();
	if (fGPUMemory)
	{
		cudaFree(fGPUMemory);
		fGPUMemory = NULL;
	}
	if (fHostLockedMemory)
	{
		for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
		{
			cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
		}
		free(fpCudaStreams);
		fGpuTracker = NULL;
		cudaFreeHost(fHostLockedMemory);
	}

	if (CudaFailedMsg(cudaThreadExit()))
	{
		HLTError("Could not uninitialize GPU");
		return(1);
	}
	HLTInfo("CUDA Uninitialized");
	fgGPUUsed = false;
	return(0);
}

void AliHLTTPCCAGPUTracker::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
{
	fOutputControl = val;
	for (int i = 0;i < fgkNSlices;i++)
	{
		fSlaveTrackers[i].SetOutputControl(val);
	}
}

#endif