]> git.uio.no Git - u/mrichter/AliRoot.git/blob - HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cu
GPU tracker update
[u/mrichter/AliRoot.git] / HLT / TPCLib / tracking-ca / AliHLTTPCCAGPUTrackerNVCC.cu
1 // **************************************************************************\r
2 // This file is property of and copyright by the ALICE HLT Project          *\r
3 // ALICE Experiment at CERN, All rights reserved.                           *\r
4 //                                                                          *\r
5 // Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *\r
6 //                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *\r
7 //                                      David Rohr <drohr@kip.uni-heidelberg.de>                                *\r
8 //                  for The ALICE HLT Project.                              *\r
9 //                                                                          *\r
10 // Permission to use, copy, modify and distribute this software and its     *\r
11 // documentation strictly for non-commercial purposes is hereby granted     *\r
12 // without fee, provided that the above copyright notice appears in all     *\r
13 // copies and that both the copyright notice and this permission notice     *\r
14 // appear in the supporting documentation. The authors make no claims       *\r
15 // about the suitability of this software for any purpose. It is            *\r
16 // provided "as is" without express or implied warranty.                    *\r
17 //                                                                          *\r
18 //***************************************************************************\r
19 \r
20 #include "AliHLTTPCCAGPUTrackerNVCC.h"\r
21 \r
22 #ifdef HLTCA_GPUCODE\r
23 #include <cuda.h>\r
24 #include <sm_11_atomic_functions.h>\r
25 #include <sm_12_atomic_functions.h>\r
26 #endif\r
27 \r
28 #ifdef R__WIN32\r
29 #else\r
30 #include <sys/syscall.h>\r
31 #include <semaphore.h>\r
32 #include <fcntl.h>\r
33 #endif\r
34 \r
35 #include "AliHLTTPCCADef.h"\r
36 #include "AliHLTTPCCAGPUConfig.h"\r
37 \r
38 \r
39 #include <iostream>\r
40 \r
41 //Disable assertions since they produce errors in GPU Code\r
42 #ifdef assert\r
43 #undef assert\r
44 #endif\r
45 #define assert(param)\r
46 \r
47 __constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];\r
48 #ifdef HLTCA_GPU_TEXTURE_FETCH\r
49 texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;\r
50 texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;\r
51 texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;\r
52 #endif\r
53 \r
54 //Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code\r
55 #include "AliHLTTPCCATrackParam.cxx"\r
56 #include "AliHLTTPCCATrack.cxx" \r
57 \r
58 #include "AliHLTTPCCAHitArea.cxx"\r
59 #include "AliHLTTPCCAGrid.cxx"\r
60 #include "AliHLTTPCCARow.cxx"\r
61 #include "AliHLTTPCCAParam.cxx"\r
62 #include "AliHLTTPCCATracker.cxx"\r
63 \r
64 #include "AliHLTTPCCAProcess.h"\r
65 \r
66 #include "AliHLTTPCCATrackletSelector.cxx"\r
67 #include "AliHLTTPCCANeighboursFinder.cxx"\r
68 #include "AliHLTTPCCANeighboursCleaner.cxx"\r
69 #include "AliHLTTPCCAStartHitsFinder.cxx"\r
70 #include "AliHLTTPCCAStartHitsSorter.cxx"\r
71 #include "AliHLTTPCCATrackletConstructor.cxx"\r
72 \r
73 #include "MemoryAssignmentHelpers.h"\r
74 \r
75 #ifndef HLTCA_STANDALONE\r
76 #include "AliHLTDefinitions.h"\r
77 #include "AliHLTSystem.h"\r
78 #endif\r
79 \r
80 ClassImp( AliHLTTPCCAGPUTrackerNVCC )\r
81 \r
82 bool AliHLTTPCCAGPUTrackerNVCC::fgGPUUsed = false;\r
83 \r
84 #define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem"\r
85 \r
86 AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() :\r
87         fGpuTracker(NULL),\r
88         fGPUMemory(NULL),\r
89         fHostLockedMemory(NULL),\r
90         fDebugLevel(0),\r
91         fDebugMask(0xFFFFFFFF),\r
92         fOutFile(NULL),\r
93         fGPUMemSize(0),\r
94         fpCudaStreams(NULL),\r
95         fSliceCount(0),\r
96         fOutputControl(NULL),\r
97         fThreadId(0),\r
98         fCudaInitialized(0),\r
99         fPPMode(0)\r
100         {};\r
101 \r
102 AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC() {};\r
103 \r
104 void AliHLTTPCCAGPUTrackerNVCC::ReleaseGlobalLock(void* sem)\r
105 {\r
106         //Release the global named semaphore that locks GPU Initialization\r
107 #ifdef R__WIN32\r
108         HANDLE* h = (HANDLE*) sem;\r
109         ReleaseSemaphore(*h, 1, NULL);\r
110         CloseHandle(*h);\r
111         delete h;\r
112 #else\r
113         sem_t* pSem = (sem_t*) sem;\r
114         sem_post(pSem);\r
115         sem_unlink(SemLockName);\r
116 #endif\r
117 }\r
118 \r
119 int AliHLTTPCCAGPUTrackerNVCC::CheckMemorySizes(int sliceCount)\r
120 {\r
121         //Check constants for correct memory sizes\r
122   if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)\r
123   {\r
124           HLTError("Insufficiant Tracker Object Memory");\r
125           return(1);\r
126   }\r
127 \r
128   if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)\r
129   {\r
130           HLTError("Insufficiant Common Memory");\r
131           return(1);\r
132   }\r
133 \r
134   if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)\r
135   {\r
136           HLTError("Insufficiant Row Memory");\r
137           return(1);\r
138   }\r
139 \r
140   if (fDebugLevel >= 3)\r
141   {\r
142           HLTInfo("Memory usage: Tracker Object %d / %d, Common Memory %d / %d, Row Memory %d / %d", sizeof(AliHLTTPCCATracker) * sliceCount, HLTCA_GPU_TRACKER_OBJECT_MEMORY, fgkNSlices * AliHLTTPCCATracker::CommonMemorySize(), HLTCA_GPU_COMMON_MEMORY, fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), HLTCA_GPU_ROWS_MEMORY);\r
143   }\r
144   return(0);\r
145 }\r
146 \r
147 int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)\r
148 {\r
149         //Find best CUDA device, initialize and allocate memory\r
150 \r
151         if (CheckMemorySizes(sliceCount)) return(1);\r
152 \r
153 #ifdef R__WIN32\r
154         HANDLE* semLock = new HANDLE;\r
155         *semLock = CreateSemaphore(NULL, 1, 1, SemLockName);\r
156         if (*semLock == NULL)\r
157         {\r
158                 HLTError("Error creating GPUInit Semaphore");\r
159                 return(1);\r
160         }\r
161         WaitForSingleObject(*semLock, INFINITE);\r
162 #else\r
163         sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1);\r
164         if (semLock == SEM_FAILED)\r
165         {\r
166                 HLTError("Error creating GPUInit Semaphore");\r
167                 return(1);\r
168         }\r
169         sem_wait(semLock);\r
170 #endif\r
171 \r
172         if (fgGPUUsed)\r
173         {\r
174             HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in same process");\r
175                 ReleaseGlobalLock(semLock);\r
176             return(1);\r
177         }\r
178         fgGPUUsed = 1;\r
179         fThreadId = GetThread();\r
180 \r
181         cudaDeviceProp fCudaDeviceProp;\r
182 \r
183         fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);\r
184 \r
185 #ifndef CUDA_DEVICE_EMULATION\r
186         int count, bestDevice = -1;\r
187         long long int bestDeviceSpeed = 0, deviceSpeed;\r
188         if (CudaFailedMsg(cudaGetDeviceCount(&count)))\r
189         {\r
190                 HLTError("Error getting CUDA Device Count");\r
191                 fgGPUUsed = 0;\r
192                 ReleaseGlobalLock(semLock);\r
193                 return(1);\r
194         }\r
195         if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");\r
196         for (int i = 0;i < count;i++)\r
197         {\r
198                 unsigned int free, total;\r
199                 cuInit(0);\r
200                 CUdevice tmpDevice;\r
201                 cuDeviceGet(&tmpDevice, i);\r
202                 CUcontext tmpContext;\r
203                 cuCtxCreate(&tmpContext, 0, tmpDevice);\r
204                 if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";\r
205                 cuCtxDestroy(tmpContext);\r
206                 if (CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i))) continue;\r
207 \r
208                 int deviceOK = sliceCount <= fCudaDeviceProp.multiProcessorCount && fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && free >= fGPUMemSize;\r
209 \r
210                 if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %d / %lld)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, free, (long long int) fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");\r
211                 deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free;\r
212                 if (deviceOK && deviceSpeed > bestDeviceSpeed)\r
213                 {\r
214                         bestDevice = i;\r
215                         bestDeviceSpeed = deviceSpeed;\r
216                 }\r
217         }\r
218         if (bestDevice == -1)\r
219         {\r
220                 HLTWarning("No CUDA Device available, aborting CUDA Initialisation");\r
221                 HLTInfo("Requiring Revision 1.3, Mem: %d", fGPUMemSize);\r
222                 fgGPUUsed = 0;\r
223                 ReleaseGlobalLock(semLock);\r
224                 return(1);\r
225         }\r
226 \r
227   int cudaDevice;\r
228   if (forceDeviceID == -1)\r
229           cudaDevice = bestDevice;\r
230   else\r
231           cudaDevice = forceDeviceID;\r
232 #else\r
233         int cudaDevice = 0;\r
234 #endif\r
235 \r
236   cudaGetDeviceProperties(&fCudaDeviceProp ,cudaDevice ); \r
237 \r
238   if (fDebugLevel >= 1)\r
239   {\r
240           HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);\r
241           HLTInfo("totalGlobalMem = %lld", (unsigned long long int) fCudaDeviceProp.totalGlobalMem);\r
242           HLTInfo("sharedMemPerBlock = %lld", (unsigned long long int) fCudaDeviceProp.sharedMemPerBlock);\r
243           HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);\r
244           HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);\r
245           HLTInfo("memPitch = %lld", (unsigned long long int) fCudaDeviceProp.memPitch);\r
246           HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);\r
247           HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);\r
248           HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);\r
249           HLTInfo("totalConstMem = %lld", (unsigned long long int) fCudaDeviceProp.totalConstMem);\r
250           HLTInfo("major = %d", fCudaDeviceProp.major);\r
251           HLTInfo("minor = %d", fCudaDeviceProp.minor);\r
252           HLTInfo("clockRate %d= ", fCudaDeviceProp.clockRate);\r
253           HLTInfo("textureAlignment %lld= ", (unsigned long long int) fCudaDeviceProp.textureAlignment);\r
254   }\r
255 \r
256   if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))\r
257   {\r
258         HLTError( "Unsupported CUDA Device" );\r
259         fgGPUUsed = 0;\r
260         ReleaseGlobalLock(semLock);\r
261         return(1);\r
262   }\r
263 \r
264   if (CudaFailedMsg(cudaSetDevice(cudaDevice)))\r
265   {\r
266           HLTError("Could not set CUDA Device!");\r
267           fgGPUUsed = 0;\r
268           ReleaseGlobalLock(semLock);\r
269           return(1);\r
270   }\r
271 \r
272   if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))\r
273   {\r
274           HLTError("CUDA Memory Allocation Error");\r
275           cudaThreadExit();\r
276           fgGPUUsed = 0;\r
277           ReleaseGlobalLock(semLock);\r
278           return(1);\r
279   }\r
280   ReleaseGlobalLock(semLock);\r
281   if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);\r
282   int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;\r
283   if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))\r
284   {\r
285           cudaFree(fGPUMemory);\r
286           cudaThreadExit();\r
287           HLTError("Error allocating Page Locked Host Memory");\r
288           fgGPUUsed = 0;\r
289           return(1);\r
290   }\r
291   if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);\r
292 \r
293   if (fDebugLevel >= 1)\r
294   {\r
295           CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));\r
296   }\r
297 \r
298   fSliceCount = sliceCount;\r
299   //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory\r
300   fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);\r
301 \r
302   for (int i = 0;i < fgkNSlices;i++)\r
303   {\r
304     fSlaveTrackers[i].SetGPUTracker();\r
305         fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));\r
306         fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));\r
307   }\r
308 \r
309   fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));\r
310   cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;\r
311   for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)\r
312   {\r
313         if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))\r
314         {\r
315             cudaFree(fGPUMemory);\r
316             cudaFreeHost(fHostLockedMemory);\r
317             cudaThreadExit();\r
318             HLTError("Error creating CUDA Stream");\r
319             fgGPUUsed = 0;\r
320             return(1);\r
321         }\r
322   }\r
323 \r
324   fCudaInitialized = 1;\r
325   HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %d, Max slices: %d)", cudaDevice, fCudaDeviceProp.name, fThreadId, fSliceCount);\r
326 \r
327 #if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)\r
328   if (fDebugLevel < 2)\r
329   {\r
330           //Do one initial run for Benchmark reasons\r
331           const int useDebugLevel = fDebugLevel;\r
332           fDebugLevel = 0;\r
333           AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];\r
334 \r
335           std::ifstream fin;\r
336 \r
337           AliHLTTPCCAParam tmpParam;\r
338           AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;\r
339 \r
340           fin.open("events/settings.dump");\r
341           int tmpCount;\r
342           fin >> tmpCount;\r
343           for (int i = 0;i < sliceCount;i++)\r
344           {\r
345                 fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);\r
346                 tmpParam.ReadSettings(fin);\r
347                 InitializeSliceParam(i, tmpParam);\r
348           }\r
349           fin.close();\r
350 \r
351           fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);\r
352           for (int i = 0;i < sliceCount;i++)\r
353           {\r
354                 tmpCluster[i].StartReading(i, 0);\r
355                 tmpCluster[i].ReadEvent(fin);\r
356                 tmpCluster[i].FinishReading();\r
357           }\r
358           fin.close();\r
359 \r
360           AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];\r
361           memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));\r
362 \r
363           Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);\r
364           for (int i = 0;i < sliceCount;i++)\r
365           {\r
366                   free(tmpOutput[i]);\r
367                   tmpOutput[i] = NULL;\r
368                   fSlaveTrackers[i].SetOutputControl(NULL);\r
369           }\r
370           delete[] tmpOutput;\r
371           delete[] tmpCluster;\r
372           fDebugLevel = useDebugLevel;\r
373   }\r
374 #endif\r
375   return(0);\r
376 }\r
377 \r
378 template <class T> inline T* AliHLTTPCCAGPUTrackerNVCC::alignPointer(T* ptr, int alignment)\r
379 {\r
380         //Macro to align Pointers.\r
381         //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker\r
382         //(As long as every single data structure is <= 1 MB)\r
383 \r
384         size_t adr = (size_t) ptr;\r
385         if (adr % alignment)\r
386         {\r
387                 adr += alignment - (adr % alignment);\r
388         }\r
389         return((T*) adr);\r
390 }\r
391 \r
392 bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsg(cudaError_t error)\r
393 {\r
394         //Check for CUDA Error and in the case of an error display the corresponding error string\r
395         if (error == cudaSuccess) return(false);\r
396         HLTWarning("CUDA Error: %d / %s", error, cudaGetErrorString(error));\r
397         return(true);\r
398 }\r
399 \r
400 int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state, int sliceLocal, int slice)\r
401 {\r
402         //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards\r
403 \r
404         if (fDebugLevel == 0) return(0);\r
405         cudaError cuErr;\r
406         cuErr = cudaGetLastError();\r
407         if (cuErr != cudaSuccess)\r
408         {\r
409                 HLTError("Cuda Error %s while running kernel (%s) (Slice %d; %d/%d)", cudaGetErrorString(cuErr), state, sliceLocal, slice, fgkNSlices);\r
410                 return(1);\r
411         }\r
412         if (CudaFailedMsg(cudaThreadSynchronize()))\r
413         {\r
414                 HLTError("CUDA Error while synchronizing (%s) (Slice %d; %d/%d)", state, sliceLocal, slice, fgkNSlices);\r
415                 return(1);\r
416         }\r
417         if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");\r
418         return(0);\r
419 }\r
420 \r
421 void AliHLTTPCCAGPUTrackerNVCC::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)\r
422 {\r
423         //Set Debug Level and Debug output File if applicable\r
424         fDebugLevel = dwLevel;\r
425         if (NewOutFile) fOutFile = NewOutFile;\r
426 }\r
427 \r
428 int AliHLTTPCCAGPUTrackerNVCC::SetGPUTrackerOption(char* OptionName, int OptionValue)\r
429 {\r
430         //Set a specific GPU Tracker Option\r
431         if (strcmp(OptionName, "PPMode") == 0)\r
432         {\r
433                 fPPMode = OptionValue;\r
434         }\r
435         else if (strcmp(OptionName, "DebugMask") == 0)\r
436         {\r
437                 fDebugMask = OptionValue;\r
438         }\r
439         else\r
440         {\r
441                 HLTError("Unknown Option: %s", OptionName);\r
442                 return(1);\r
443         }\r
444         return(0);\r
445 }\r
446 \r
447 #ifdef HLTCA_STANDALONE\r
448 void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int iSlice, int i)\r
449 {\r
450   //Run Performance Query for timer i of slice iSlice\r
451   if (fDebugLevel >= 1)\r
452   {\r
453           AliHLTTPCCAStandaloneFramework::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));\r
454   }\r
455 }\r
456 #else\r
457 void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}\r
458 #endif\r
459 \r
460 void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)\r
461 {\r
462         //Dump Rowblocks to File\r
463         if (fDebugLevel >= 4)\r
464         {\r
465                 *fOutFile << "RowBlock Tracklets (Slice " << tracker[iSlice].Param().ISlice() << " (" << iSlice << " of reco))";\r
466                 *fOutFile << " after Tracklet Reconstruction";\r
467                 *fOutFile << std::endl;\r
468         \r
469                 int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);\r
470                 int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);\r
471                 uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT);\r
472                 CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));\r
473                 CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));\r
474                 CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT, cudaMemcpyDeviceToHost));\r
475                 CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));\r
476 \r
477                 int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;\r
478                 for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)\r
479                 {\r
480                         *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " << \r
481                                 rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";\r
482                         for (int j = 0;j < rowBlockPos[i].x;j++)\r
483                         {\r
484                                 //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!\r
485                                 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";\r
486 #ifdef HLTCA_GPU_SCHED_FIXED_START\r
487                                 if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)\r
488                                 {\r
489                                         HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);\r
490                                 }\r
491 #endif //HLTCA_GPU_SCHED_FIXED_START\r
492                                 k++;\r
493                                 if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)\r
494                                 {\r
495                                         HLTError("Error, -1 Tracklet found");\r
496                                 }\r
497                         }\r
498                         *fOutFile << std::endl << "Phase 2: ";\r
499                         for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)\r
500                         {\r
501                                 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";\r
502                         }\r
503                         *fOutFile << std::endl;\r
504                 }\r
505 \r
506                 if (check)\r
507                 {\r
508                         *fOutFile << "Starting Threads: (First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;\r
509                         for (int i = 0;i < HLTCA_GPU_BLOCK_COUNT;i++)\r
510                         {\r
511                                 *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;\r
512                         }\r
513                 }\r
514 \r
515                 free(rowBlockPos);\r
516                 free(rowBlockTracklets);\r
517                 free(blockStartingTracklet);\r
518         }\r
519 }\r
520 \r
521 __global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)\r
522 {\r
523         //Initialize GPU RowBlocks and HitWeights\r
524         int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;\r
525         int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;\r
526         const int stride = blockDim.x * gridDim.x;\r
527         int4 i0, i1;\r
528         i0.x = i0.y = i0.z = i0.w = 0;\r
529         i1.x = i1.y = i1.z = i1.w = -1;\r
530         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)\r
531                 RowBlockPos[i] = i0;\r
532         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)\r
533                 rowBlockTracklets4[i] = i1;\r
534         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)\r
535                 sliceDataHitWeights4[i] = i0;\r
536 }\r
537 \r
538 int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)\r
539 {\r
540         //Primary reconstruction function\r
541 \r
542         cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;\r
543 \r
544         if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;\r
545         \r
546         if (!fCudaInitialized)\r
547         {\r
548             HLTError("GPUTracker not initialized");\r
549             return(1);\r
550         }\r
551         if (sliceCountLocal > fSliceCount)\r
552         {\r
553             HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);\r
554             return(1);\r
555         }\r
556         if (fThreadId != GetThread())\r
557         {\r
558             HLTError("GPUTracker context was initialized by different thread, Initializing Thread: %d, Processing Thread: %d", fThreadId, GetThread());\r
559             return(1);\r
560         }\r
561 \r
562         if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);\r
563 \r
564         if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)\r
565         {\r
566                 HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));\r
567                 return(1);\r
568         }\r
569 \r
570         if (fPPMode) return(ReconstructPP(pOutput, pClusterData, firstSlice, sliceCountLocal));\r
571 \r
572         memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);\r
573 \r
574         if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");\r
575 \r
576 #ifdef HLTCA_GPU_TIME_PROFILE\r
577         unsigned long long int a, b, c, d;\r
578         AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);\r
579         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);\r
580 #endif\r
581         \r
582         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
583         {\r
584                 //Make this a GPU Tracker\r
585                 fGpuTracker[iSlice].SetGPUTracker();\r
586                 fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));\r
587                 fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));\r
588                 fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);\r
589 \r
590                 //Set Pointers to GPU Memory\r
591                 char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);\r
592 \r
593                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");\r
594                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());\r
595                 tmpMem = alignPointer(tmpMem, 1024 * 1024);\r
596 \r
597                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");\r
598                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);\r
599                 tmpMem = alignPointer(tmpMem, 1024 * 1024);\r
600 \r
601                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");\r
602                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());\r
603                 tmpMem = alignPointer(tmpMem, 1024 * 1024);\r
604 \r
605                 if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)\r
606                 {\r
607                         HLTError("Insufficiant Track Memory");\r
608                         return(1);\r
609                 }\r
610 \r
611                 if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)\r
612                 {\r
613                         HLTError("Insufficiant Global Memory");\r
614                         return(1);\r
615                 }\r
616 \r
617                 if (fDebugLevel >= 3)\r
618                 {\r
619                         HLTInfo("GPU Global Memory Used: %d/%d, Page Locked Tracks Memory used: %d / %d", tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice), HLTCA_GPU_GLOBAL_MEMORY, fGpuTracker[iSlice].TrackMemorySize(), HLTCA_GPU_TRACKS_MEMORY);\r
620                 }\r
621 \r
622                 //Initialize Startup Constants\r
623                 *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;\r
624                 *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;\r
625                 *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;\r
626                 fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = HLTCA_GPU_BLOCK_COUNT * (iSlice + 1) / sliceCountLocal - HLTCA_GPU_BLOCK_COUNT * (iSlice) / sliceCountLocal;\r
627                 if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);\r
628                 fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;\r
629                 fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;\r
630                 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;\r
631                 fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());\r
632         }\r
633 \r
634 #ifdef HLTCA_GPU_TEXTURE_FETCH\r
635                 cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();\r
636                 size_t offset;\r
637                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
638                 {\r
639                         HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);\r
640                         return(1);\r
641                 }\r
642                 cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();\r
643                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
644                 {\r
645                         HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);\r
646                         return(1);\r
647                 }\r
648                 cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();\r
649                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
650                 {\r
651                         HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);\r
652                         return(1);\r
653                 }\r
654 #endif\r
655 \r
656         //Copy Tracker Object to GPU Memory\r
657         if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");\r
658 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE\r
659         char* tmpMem;\r
660         if (CudaFailedMsg(cudaMalloc(&tmpMem, 100000000))) return(1);\r
661         fGpuTracker[0].fStageAtSync = tmpMem;\r
662         CudaFailedMsg(cudaMemset(fGpuTracker[0].StageAtSync(), 0, 100000000));\r
663 #endif\r
664         CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));\r
665 \r
666         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
667         {\r
668                 StandalonePerfTime(firstSlice + iSlice, 0);\r
669 \r
670                 //Initialize GPU Slave Tracker\r
671                 if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");\r
672                 fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));\r
673 #ifdef HLTCA_GPU_TIME_PROFILE\r
674                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);\r
675 #endif\r
676                 fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);\r
677 \r
678                   if (fDebugLevel >= 4)\r
679                   {\r
680                           *fOutFile << std::endl << std::endl << "Reconstruction: " << iSlice << "/" << sliceCountLocal << " Total Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << " / " << fgkNSlices << std::endl;\r
681 \r
682                           if (fDebugMask & 1) fSlaveTrackers[firstSlice + iSlice].DumpSliceData(*fOutFile);\r
683                   }\r
684 \r
685 #ifdef HLTCA_GPU_TIME_PROFILE\r
686                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);\r
687                 printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);\r
688 #endif\r
689                 if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)\r
690                 {\r
691                         HLTError("Insufficiant Slice Data Memory");\r
692                         return(1);\r
693                 }\r
694 \r
695                 if (fDebugLevel >= 3)\r
696                 {\r
697                         HLTInfo("GPU Slice Data Memory Used: %d/%d", fSlaveTrackers[firstSlice + iSlice].Data().MemorySize(), HLTCA_GPU_SLICE_DATA_MEMORY);\r
698                 }\r
699 \r
700                 //Initialize temporary memory where needed\r
701                 if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");           \r
702                 PreInitRowBlocks<<<30, 256, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());\r
703 \r
704                 //Copy Data to GPU Global Memory\r
705                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));\r
706                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));\r
707                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));\r
708 \r
709                 if (fDebugLevel >= 4)\r
710                 {\r
711                         if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");\r
712                         fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS);\r
713                         fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() );\r
714                 }\r
715                 \r
716                 if (CUDASync("Initialization", iSlice, iSlice + firstSlice)) return(1);\r
717                 StandalonePerfTime(firstSlice + iSlice, 1);\r
718 \r
719                 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder (Slice %d/%d)", iSlice, sliceCountLocal);\r
720                 AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);\r
721 \r
722                 if (CUDASync("Neighbours finder", iSlice, iSlice + firstSlice)) return 1;\r
723 \r
724                 StandalonePerfTime(firstSlice + iSlice, 2);\r
725 \r
726                 if (fDebugLevel >= 4)\r
727                 {\r
728                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));\r
729                         if (fDebugMask & 2) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);\r
730                 }\r
731 \r
732                 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner (Slice %d/%d)", iSlice, sliceCountLocal);\r
733                 AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);\r
734                 if (CUDASync("Neighbours Cleaner", iSlice, iSlice + firstSlice)) return 1;\r
735 \r
736                 StandalonePerfTime(firstSlice + iSlice, 3);\r
737 \r
738                 if (fDebugLevel >= 4)\r
739                 {\r
740                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));\r
741                         if (fDebugMask & 4) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);\r
742                 }\r
743 \r
744                 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder (Slice %d/%d)", iSlice, sliceCountLocal);\r
745                 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);\r
746                 if (CUDASync("Start Hits Finder", iSlice, iSlice + firstSlice)) return 1;\r
747 \r
748                 StandalonePerfTime(firstSlice + iSlice, 4);\r
749 \r
750                 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter (Slice %d/%d)", iSlice, sliceCountLocal);\r
751                 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<30, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);\r
752                 if (CUDASync("Start Hits Sorter", iSlice, iSlice + firstSlice)) return 1;\r
753 \r
754                 StandalonePerfTime(firstSlice + iSlice, 5);\r
755 \r
756                 if (fDebugLevel >= 2)\r
757                 {\r
758                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));\r
759                         if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d (Slice %d)", *fSlaveTrackers[firstSlice + iSlice].NTracklets(), iSlice);\r
760                         if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS)\r
761                         {\r
762                                 HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant");\r
763                                 return(1);\r
764                         }\r
765                 }\r
766 \r
767                 if (fDebugLevel >= 4)\r
768                 {\r
769                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));\r
770                         if (fDebugMask & 8)\r
771                         {\r
772                                 *fOutFile << "Temporary ";\r
773                                 fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);\r
774                         }\r
775                         uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());\r
776                         CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));\r
777                         if (fDebugMask & 16)\r
778                         {\r
779                                 *fOutFile << "Start Hits Sort Vector:" << std::endl;\r
780                                 for (int i = 0;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows();i++)\r
781                                 {\r
782                                         *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;\r
783                                 }\r
784                         }\r
785                         free(tmpMemory);\r
786 \r
787                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));\r
788                         if (fDebugMask & 32) fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);\r
789                 }\r
790 \r
791                 StandalonePerfTime(firstSlice + iSlice, 6);\r
792                 \r
793                 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());\r
794         }\r
795 \r
796         StandalonePerfTime(firstSlice, 7);\r
797 #ifdef HLTCA_GPU_PREFETCHDATA\r
798         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
799         {\r
800                 if (fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v) > ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4))\r
801                 {\r
802                         HLTError("Insufficiant GPU shared Memory, required: %d, available %d", fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v), ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4));\r
803                         return(1);\r
804                 }\r
805                 if (fDebugLevel >= 1)\r
806                 {\r
807                         static int infoShown = 0;\r
808                         if (!infoShown)\r
809                         {\r
810                                 HLTInfo("GPU Shared Memory Cache Size: %d", 2 * fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v));\r
811                                 infoShown = 1;\r
812                         }\r
813                 }\r
814         }\r
815 #endif\r
816 \r
817         int nHardCollisions = 0;\r
818 \r
819 RestartTrackletConstructor:\r
820         if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");\r
821         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
822         {\r
823                 AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);\r
824                 if (CUDASync("Tracklet Initializer", iSlice, iSlice + firstSlice)) return 1;\r
825                 if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice);\r
826         }\r
827 \r
828         if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");\r
829         AliHLTTPCCATrackletConstructorGPU<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>();\r
830         if (CUDASync("Tracklet Constructor", 0, firstSlice)) return 1;\r
831         \r
832         StandalonePerfTime(firstSlice, 8);\r
833 \r
834         if (fDebugLevel >= 4)\r
835         {\r
836                 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
837                 {\r
838                         if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);\r
839                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));\r
840                         if (fDebugLevel >= 5)\r
841                         {\r
842                                 HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());\r
843                         }\r
844                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));\r
845                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));\r
846                         if (fDebugMask & 128) fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);\r
847                 }\r
848         }\r
849 \r
850         int runSlices = 0;\r
851         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)\r
852         {\r
853                 if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;\r
854                 if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);\r
855                 AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));\r
856                 if (CUDASync("Tracklet Selector", iSlice, iSlice + firstSlice)) return 1;\r
857         }\r
858         StandalonePerfTime(firstSlice, 9);\r
859 \r
860         int tmpSlice = 0, tmpSlice2 = 0;\r
861         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
862         {\r
863                 if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");\r
864 \r
865                 while(tmpSlice < sliceCountLocal && (tmpSlice == iSlice || cudaStreamQuery(cudaStreams[tmpSlice]) == CUDA_SUCCESS))\r
866                 {\r
867                         if (CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice]))) return(1);\r
868                         tmpSlice++;\r
869                 }\r
870 \r
871                 while (tmpSlice2 < tmpSlice && (tmpSlice2 == iSlice ? cudaStreamSynchronize(cudaStreams[tmpSlice2]) : cudaStreamQuery(cudaStreams[tmpSlice2])) == CUDA_SUCCESS)\r
872                 {\r
873                         CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].Tracks(), fGpuTracker[tmpSlice2].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + tmpSlice2].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));\r
874                         CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].TrackHits(), fGpuTracker[tmpSlice2].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + tmpSlice2].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));\r
875                         tmpSlice2++;\r
876                 }\r
877 \r
878                 cudaStreamSynchronize(cudaStreams[iSlice]);\r
879 \r
880                 if (fDebugLevel >= 4)\r
881                 {\r
882                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));\r
883                         if (fDebugMask & 256) fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);\r
884                         if (fDebugMask & 512) fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);\r
885                 }\r
886 \r
887                 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)\r
888                 {\r
889                         if ((fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION || fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_WRONG_ROW)&& nHardCollisions++ < 10)\r
890                         {\r
891                                 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION)\r
892                                 {\r
893                                         HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor");\r
894                                 }\r
895                                 else\r
896                                 {\r
897                                         HLTWarning("Tracklet Constructor returned invalid row");\r
898                                 }\r
899                                 for (int i = 0;i < sliceCountLocal;i++)\r
900                                 {\r
901                                         cudaThreadSynchronize();\r
902                                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));\r
903                                         *fSlaveTrackers[firstSlice + i].NTracks() = 0;\r
904                                         *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;\r
905                                         fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;\r
906                                         CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));\r
907                                         PreInitRowBlocks<<<30, 256>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());\r
908                                 }\r
909                                 goto RestartTrackletConstructor;\r
910                         }\r
911                         HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);\r
912                         return(1);\r
913                 }\r
914                 if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());\r
915 \r
916                 fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);\r
917 #ifdef HLTCA_GPU_TIME_PROFILE\r
918                 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);\r
919 #endif\r
920                 fSlaveTrackers[firstSlice + iSlice].WriteOutput();\r
921 #ifdef HLTCA_GPU_TIME_PROFILE\r
922                 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);\r
923                 printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);\r
924 #endif\r
925 \r
926                 if (fDebugLevel >= 4)\r
927                 {\r
928                         delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();\r
929                         delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();\r
930                 }\r
931         }\r
932 \r
933         StandalonePerfTime(firstSlice, 10);\r
934 \r
935         if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");\r
936 \r
937 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE\r
938         char* stageAtSync = (char*) malloc(100000000);\r
939         CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].StageAtSync(), 100 * 1000 * 1000, cudaMemcpyDeviceToHost));\r
940         cudaFree(fGpuTracker[0].StageAtSync());\r
941 \r
942         FILE* fp = fopen("profile.txt", "w+");\r
943         FILE* fp2 = fopen("profile.bmp", "w+b");\r
944         int nEmptySync = 0, fEmpty;\r
945 \r
946         const int bmpheight = 1000;\r
947         BITMAPFILEHEADER bmpFH;\r
948         BITMAPINFOHEADER bmpIH;\r
949         ZeroMemory(&bmpFH, sizeof(bmpFH));\r
950         ZeroMemory(&bmpIH, sizeof(bmpIH));\r
951         \r
952         bmpFH.bfType = 19778; //"BM"\r
953         bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1) * bmpheight ;\r
954         bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);\r
955 \r
956         bmpIH.biSize = sizeof(bmpIH);\r
957         bmpIH.biWidth = HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1;\r
958         bmpIH.biHeight = bmpheight;\r
959         bmpIH.biPlanes = 1;\r
960         bmpIH.biBitCount = 32;\r
961 \r
962         fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);\r
963         fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);  \r
964 \r
965         for (int i = 0;i < bmpheight * HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;i += HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT)\r
966         {\r
967                 fEmpty = 1;\r
968                 for (int j = 0;j < HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;j++)\r
969                 {\r
970                         fprintf(fp, "%d\t", stageAtSync[i + j]);\r
971                         int color = 0;\r
972                         if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);\r
973                         if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);\r
974                         if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);\r
975                         if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);\r
976                         fwrite(&color, 1, sizeof(int), fp2);\r
977                         if (j > 0 && j % 32 == 0)\r
978                         {\r
979                                 color = RGB(255, 255, 255);\r
980                                 fwrite(&color, 1, 4, fp2);\r
981                         }\r
982                         if (stageAtSync[i + j]) fEmpty = 0;\r
983                 }\r
984                 fprintf(fp, "\n");\r
985                 if (fEmpty) nEmptySync++;\r
986                 else nEmptySync = 0;\r
987                 //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;\r
988         }\r
989 \r
990         fclose(fp);\r
991         fclose(fp2);\r
992         free(stageAtSync);\r
993 #endif \r
994 \r
995         return(0);\r
996 }\r
997 \r
998 __global__ void ClearPPHitWeights(int sliceCount)\r
999 {\r
1000         //Clear HitWeights\r
1001         \r
1002         for (int k = 0;k < sliceCount;k++)\r
1003         {\r
1004                 AliHLTTPCCATracker &tracker = ((AliHLTTPCCATracker*) gAliHLTTPCCATracker)[k];\r
1005                 int4* const pHitWeights = (int4*) tracker.Data().HitWeights();\r
1006                 const int dwCount = tracker.Data().NumberOfHitsPlusAlign();\r
1007                 const int stride = blockDim.x * gridDim.x;\r
1008                 int4 i0;\r
1009                 i0.x = i0.y = i0.z = i0.w = 0;\r
1010         \r
1011                 for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < dwCount * sizeof(int) / sizeof(int4);i += stride)\r
1012                 {\r
1013                         pHitWeights[i] = i0;\r
1014                 }\r
1015         }\r
1016 }\r
1017 \r
1018 int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)\r
1019 {\r
1020         //Primary reconstruction function for small events (PP)\r
1021 \r
1022         memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);\r
1023 \r
1024         if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");\r
1025 \r
1026 #ifdef HLTCA_GPU_TIME_PROFILE\r
1027         unsigned long long int a, b, c, d;\r
1028         AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);\r
1029         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);\r
1030 #endif\r
1031 \r
1032         char* tmpSliceMemHost = (char*) SliceDataMemory(fHostLockedMemory, 0);\r
1033         char* tmpSliceMemGpu = (char*) SliceDataMemory(fGPUMemory, 0);\r
1034 \r
1035         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
1036         {\r
1037                 StandalonePerfTime(firstSlice + iSlice, 0);\r
1038 \r
1039                 //Initialize GPU Slave Tracker\r
1040                 if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");\r
1041                 fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(tmpSliceMemHost, RowMemory(fHostLockedMemory, firstSlice + iSlice));\r
1042 #ifdef HLTCA_GPU_TIME_PROFILE\r
1043                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);\r
1044 #endif\r
1045                 fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);\r
1046 #ifdef HLTCA_GPU_TIME_PROFILE\r
1047                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);\r
1048                 printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);\r
1049 #endif\r
1050                 if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)\r
1051                 {\r
1052                         HLTError("Insufficiant Slice Data Memory");\r
1053                         return(1);\r
1054                 }\r
1055 \r
1056                 //Make this a GPU Tracker\r
1057                 fGpuTracker[iSlice].SetGPUTracker();\r
1058                 fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));\r
1059 \r
1060 \r
1061                 fGpuTracker[iSlice].SetGPUSliceDataMemory(tmpSliceMemGpu, RowMemory(fGPUMemory, iSlice));\r
1062                 fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);\r
1063 \r
1064                 tmpSliceMemHost += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();\r
1065                 tmpSliceMemHost = alignPointer(tmpSliceMemHost, 64 * 1024);\r
1066                 tmpSliceMemGpu += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();\r
1067                 tmpSliceMemGpu = alignPointer(tmpSliceMemGpu, 64 * 1024);\r
1068 \r
1069 \r
1070                 //Set Pointers to GPU Memory\r
1071                 char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);\r
1072 \r
1073                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");\r
1074                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());\r
1075                 tmpMem = alignPointer(tmpMem, 64 * 1024);\r
1076 \r
1077                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");\r
1078                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);\r
1079                 tmpMem = alignPointer(tmpMem, 64 * 1024);\r
1080 \r
1081                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");\r
1082                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());\r
1083                 tmpMem = alignPointer(tmpMem, 64 * 1024);\r
1084 \r
1085                 if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)\r
1086                 {\r
1087                         HLTError("Insufficiant Track Memory");\r
1088                         return(1);\r
1089                 }\r
1090 \r
1091                 if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)\r
1092                 {\r
1093                         HLTError("Insufficiant Global Memory");\r
1094                         return(1);\r
1095                 }\r
1096 \r
1097                 //Initialize Startup Constants\r
1098                 *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;\r
1099                 *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;\r
1100                 *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;\r
1101                 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;\r
1102 \r
1103                 fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());\r
1104 \r
1105                 if (CUDASync("Initialization", iSlice, iSlice + firstSlice)) return(1);\r
1106                 StandalonePerfTime(firstSlice + iSlice, 1);\r
1107         }\r
1108 \r
1109 #ifdef HLTCA_GPU_TEXTURE_FETCH\r
1110                 cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();\r
1111                 size_t offset;\r
1112                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
1113                 {\r
1114                         HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);\r
1115                         return(1);\r
1116                 }\r
1117                 cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();\r
1118                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
1119                 {\r
1120                         HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);\r
1121                         return(1);\r
1122                 }\r
1123                 cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();\r
1124                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
1125                 {\r
1126                         HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);\r
1127                         return(1);\r
1128                 }\r
1129 #endif\r
1130 \r
1131         //Copy Tracker Object to GPU Memory\r
1132         if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");\r
1133         CudaFailedMsg(cudaMemcpyToSymbol(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice));\r
1134 \r
1135         //Copy Data to GPU Global Memory\r
1136         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
1137         {\r
1138                 CudaFailedMsg(cudaMemcpy(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice));\r
1139                 //printf("%lld %lld %d %d\n", (size_t) (char*) fGpuTracker[iSlice].Data().Memory(), (size_t) (char*) fSlaveTrackers[firstSlice + iSlice].Data().Memory(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize());\r
1140         }\r
1141         //CudaFailedMsg(cudaMemcpy(SliceDataMemory(fGPUMemory, 0), SliceDataMemory(fHostLockedMemory, 0), tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0), cudaMemcpyHostToDevice));\r
1142         //printf("%lld %lld %d\n", (size_t) (char*) SliceDataMemory(fGPUMemory, 0), (size_t) (char*) SliceDataMemory(fHostLockedMemory, 0), (int) (size_t) (tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0)));\r
1143         CudaFailedMsg(cudaMemcpy(fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyHostToDevice));\r
1144         CudaFailedMsg(cudaMemcpy(fGpuTracker[0].SliceDataRows(), fSlaveTrackers[firstSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) * sliceCountLocal, cudaMemcpyHostToDevice));\r
1145 \r
1146         if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");\r
1147         AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursFinder> <<<30, 256>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows());\r
1148         if (CUDASync("Neighbours finder", 0, firstSlice)) return 1;\r
1149         StandalonePerfTime(firstSlice, 2);\r
1150         if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");\r
1151         AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursCleaner> <<<30, 256>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 2);\r
1152         if (CUDASync("Neighbours Cleaner", 0, firstSlice)) return 1;\r
1153         StandalonePerfTime(firstSlice, 3);\r
1154         if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");\r
1155         AliHLTTPCCAProcessMultiA<AliHLTTPCCAStartHitsFinder> <<<30, 256>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 6);\r
1156         if (CUDASync("Start Hits Finder", 0, firstSlice)) return 1;\r
1157         StandalonePerfTime(firstSlice, 4);\r
1158 \r
1159         ClearPPHitWeights <<<30, 256>>>(sliceCountLocal);\r
1160         if (CUDASync("Clear Hit Weights", 0, firstSlice)) return 1;\r
1161 \r
1162         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
1163         {\r
1164                 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());\r
1165         }\r
1166 \r
1167         StandalonePerfTime(firstSlice, 7);\r
1168 \r
1169         if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");\r
1170         AliHLTTPCCATrackletConstructorGPUPP<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal);\r
1171         if (CUDASync("Tracklet Constructor PP", 0, firstSlice)) return 1;\r
1172         \r
1173         StandalonePerfTime(firstSlice, 8);\r
1174 \r
1175         AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal);\r
1176         if (CUDASync("Tracklet Selector", 0, firstSlice)) return 1;\r
1177         StandalonePerfTime(firstSlice, 9);\r
1178 \r
1179         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice].CommonMemory(), fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyDeviceToHost));\r
1180 \r
1181         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
1182         {\r
1183                 if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");\r
1184 \r
1185                 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost));\r
1186                 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost));\r
1187 \r
1188                 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)\r
1189                 {\r
1190                         HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);\r
1191                         return(1);\r
1192                 }\r
1193                 if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());\r
1194 \r
1195                 fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);\r
1196 #ifdef HLTCA_GPU_TIME_PROFILE\r
1197                 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);\r
1198 #endif\r
1199                 fSlaveTrackers[firstSlice + iSlice].WriteOutput();\r
1200 #ifdef HLTCA_GPU_TIME_PROFILE\r
1201                 AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);\r
1202                 printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);\r
1203 #endif\r
1204         }\r
1205 \r
1206         StandalonePerfTime(firstSlice, 10);\r
1207 \r
1208         if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");\r
1209 \r
1210         return(0);\r
1211 }\r
1212 \r
1213 int AliHLTTPCCAGPUTrackerNVCC::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)\r
1214 {\r
1215         //Initialize Slice Tracker Parameter for a slave tracker\r
1216         fSlaveTrackers[iSlice].Initialize(param);\r
1217         if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)\r
1218         {\r
1219                 HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);\r
1220                 return(1);\r
1221         }\r
1222         return(0);\r
1223 }\r
1224 \r
1225 int AliHLTTPCCAGPUTrackerNVCC::ExitGPU()\r
1226 {\r
1227         //Uninitialize CUDA\r
1228         cudaThreadSynchronize();\r
1229         if (fGPUMemory)\r
1230         {\r
1231                 cudaFree(fGPUMemory);\r
1232                 fGPUMemory = NULL;\r
1233         }\r
1234         if (fHostLockedMemory)\r
1235         {\r
1236                 for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)\r
1237                 {\r
1238                         cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);\r
1239                 }\r
1240                 free(fpCudaStreams);\r
1241                 fGpuTracker = NULL;\r
1242                 cudaFreeHost(fHostLockedMemory);\r
1243         }\r
1244 \r
1245         if (CudaFailedMsg(cudaThreadExit()))\r
1246         {\r
1247                 HLTError("Could not uninitialize GPU");\r
1248                 return(1);\r
1249         }\r
1250         HLTInfo("CUDA Uninitialized");\r
1251         fgGPUUsed = false;\r
1252         fCudaInitialized = 0;\r
1253         return(0);\r
1254 }\r
1255 \r
1256 void AliHLTTPCCAGPUTrackerNVCC::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)\r
1257 {\r
1258         //Set Output Control Pointers\r
1259         fOutputControl = val;\r
1260         for (int i = 0;i < fgkNSlices;i++)\r
1261         {\r
1262                 fSlaveTrackers[i].SetOutputControl(val);\r
1263         }\r
1264 }\r
1265 \r
1266 int AliHLTTPCCAGPUTrackerNVCC::GetThread()\r
1267 {\r
1268         //Get Thread ID\r
1269 #ifdef R__WIN32\r
1270         return((int) (size_t) GetCurrentThread());\r
1271 #else\r
1272         return((int) syscall (SYS_gettid));\r
1273 #endif\r
1274 }\r
1275 \r
1276 unsigned long long int* AliHLTTPCCAGPUTrackerNVCC::PerfTimer(int iSlice, unsigned int i)\r
1277 {\r
1278         //Returns pointer to PerfTimer i of slice iSlice\r
1279         return(fSlaveTrackers ? fSlaveTrackers[iSlice].PerfTimer(i) : NULL);\r
1280 }\r
1281 \r
1282 const AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerNVCC::OutputControl() const\r
1283 {\r
1284         //Return Pointer to Output Control Structure\r
1285         return fOutputControl;\r
1286 }\r
1287 \r
1288 int AliHLTTPCCAGPUTrackerNVCC::GetSliceCount() const\r
1289 {\r
1290         //Return max slice count processable\r
1291         return(fSliceCount);\r
1292 }\r
1293 \r
1294 AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()\r
1295 {\r
1296         return new AliHLTTPCCAGPUTrackerNVCC;\r
1297 }\r
1298 \r
1299 void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr)\r
1300 {\r
1301         delete ptr;\r
1302 }\r
1303 \r