]> git.uio.no Git - u/mrichter/AliRoot.git/blob - HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cu
GPU update from David Rohr
[u/mrichter/AliRoot.git] / HLT / TPCLib / tracking-ca / AliHLTTPCCAGPUTrackerNVCC.cu
1 // **************************************************************************
2 // This file is property of and copyright by the ALICE HLT Project          *
3 // ALICE Experiment at CERN, All rights reserved.                           *
4 //                                                                          *
5 // Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
6 //                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *
7 //                                      David Rohr <drohr@kip.uni-heidelberg.de>                                *
8 //                  for The ALICE HLT Project.                              *
9 //                                                                          *
10 // Permission to use, copy, modify and distribute this software and its     *
11 // documentation strictly for non-commercial purposes is hereby granted     *
12 // without fee, provided that the above copyright notice appears in all     *
13 // copies and that both the copyright notice and this permission notice     *
14 // appear in the supporting documentation. The authors make no claims       *
15 // about the suitability of this software for any purpose. It is            *
16 // provided "as is" without express or implied warranty.                    *
17 //                                                                          *
18 //***************************************************************************
19
20 #include "AliHLTTPCCAGPUTracker.h"
21
22 #ifdef BUILD_GPU
23
24 #include <cuda.h>
25 #include <sys/syscall.h>
26
27 #include "AliHLTTPCCADef.h"
28 #include "AliHLTTPCCAGPUConfig.h"
29
30 #include <sm_11_atomic_functions.h>
31 #include <sm_12_atomic_functions.h>
32
33 #include <iostream>
34
35 //Disable assertions since they produce errors in GPU Code
36 #ifdef assert
37 #undef assert
38 #endif
39 #define assert(param)
40
41 __constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];
42 #ifdef HLTCA_GPU_TEXTURE_FETCH
43 texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;
44 texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;
45 texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
46 #endif
47
48 //Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code
49 #include "AliHLTTPCCATrackParam.cxx"
50 #include "AliHLTTPCCATrack.cxx" 
51
52 #include "AliHLTTPCCAHitArea.cxx"
53 #include "AliHLTTPCCAGrid.cxx"
54 #include "AliHLTTPCCARow.cxx"
55 #include "AliHLTTPCCAParam.cxx"
56 #include "AliHLTTPCCATracker.cxx"
57
58 #include "AliHLTTPCCAProcess.h"
59
60 #include "AliHLTTPCCATrackletSelector.cxx"
61 #include "AliHLTTPCCANeighboursFinder.cxx"
62 #include "AliHLTTPCCANeighboursCleaner.cxx"
63 #include "AliHLTTPCCAStartHitsFinder.cxx"
64 #include "AliHLTTPCCAStartHitsSorter.cxx"
65 #include "AliHLTTPCCATrackletConstructor.cxx"
66 #include "AliHLTTPCCASliceOutput.cxx"
67
68 #include "MemoryAssignmentHelpers.h"
69
70 #ifndef HLTCA_STANDALONE
71 #include "AliHLTDefinitions.h"
72 #include "AliHLTSystem.h"
73 #endif
74
75 ClassImp( AliHLTTPCCAGPUTracker )
76
77 bool AliHLTTPCCAGPUTracker::fgGPUUsed = false;
78
79 int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
80 {
81         //Find best CUDA device, initialize and allocate memory
82         
83         if (fgGPUUsed)
84         {
85             HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in same process");
86             return(1);
87         }
88         fgGPUUsed = 1;
89         fThreadId = (int) syscall (SYS_gettid);
90
91         cudaDeviceProp fCudaDeviceProp;
92
93         fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
94
95 #ifndef CUDA_DEVICE_EMULATION
96         int count, bestDevice = -1;
97         long long int bestDeviceSpeed = 0, deviceSpeed;
98         if (CudaFailedMsg(cudaGetDeviceCount(&count)))
99         {
100                 HLTError("Error getting CUDA Device Count");
101                 fgGPUUsed = 0;
102                 return(1);
103         }
104         if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");
105         for (int i = 0;i < count;i++)
106         {
107                 unsigned int free, total;
108                 cuInit(0);
109                 CUdevice tmpDevice;
110                 cuDeviceGet(&tmpDevice, i);
111                 CUcontext tmpContext;
112                 cuCtxCreate(&tmpContext, 0, tmpDevice);
113                 if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";
114                 cuCtxDestroy(tmpContext);
115                 CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i));
116
117                 int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && free >= fGPUMemSize;
118
119                 if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %d / %d)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, free, fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");
120                 deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free;
121                 if (deviceOK && deviceSpeed > bestDeviceSpeed)
122                 {
123                         bestDevice = i;
124                         bestDeviceSpeed = deviceSpeed;
125                 }
126         }
127         if (bestDevice == -1)
128         {
129                 HLTWarning("No CUDA Device available, aborting CUDA Initialisation");
130                 fgGPUUsed = 0;
131                 return(1);
132         }
133
134   int cudaDevice;
135   if (forceDeviceID == -1)
136           cudaDevice = bestDevice;
137   else
138           cudaDevice = forceDeviceID;
139 #else
140         int cudaDevice = 0;
141 #endif
142
143   cudaGetDeviceProperties(&fCudaDeviceProp ,cudaDevice ); 
144
145   if (fDebugLevel >= 1)
146   {
147           HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);
148           HLTInfo("totalGlobalMem = %d", fCudaDeviceProp.totalGlobalMem);
149           HLTInfo("sharedMemPerBlock = %d", fCudaDeviceProp.sharedMemPerBlock);
150           HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);
151           HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);
152           HLTInfo("memPitch = %d", fCudaDeviceProp.memPitch);
153           HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);
154           HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);
155           HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);
156           HLTInfo("totalConstMem = %d", fCudaDeviceProp.totalConstMem);
157           HLTInfo("major = %d", fCudaDeviceProp.major);
158           HLTInfo("minor = %d", fCudaDeviceProp.minor);
159           HLTInfo("clockRate %d= ", fCudaDeviceProp.clockRate);
160           HLTInfo("textureAlignment %d= ", fCudaDeviceProp.textureAlignment);
161   }
162
163   if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
164   {
165         HLTError( "Unsupported CUDA Device" );
166         fgGPUUsed = 0;
167         return(1);
168   }
169
170   if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
171   {
172           HLTError("Insufficiant Tracker Object Memory");
173           fgGPUUsed = 0;
174           return(1);
175   }
176   
177   if (CudaFailedMsg(cudaSetDevice(cudaDevice)))
178   {
179           HLTError("Could not set CUDA Device!");
180           fgGPUUsed = 0;
181           return(1);
182   }
183
184   if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
185   {
186           HLTError("Insufficiant Common Memory");
187           cudaThreadExit();
188           fgGPUUsed = 0;
189           return(1);
190   }
191
192   if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
193   {
194           HLTError("Insufficiant Row Memory");
195           cudaThreadExit();
196           fgGPUUsed = 0;
197           return(1);
198   }
199
200   if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
201   {
202           HLTError("CUDA Memory Allocation Error");
203           cudaThreadExit();
204           fgGPUUsed = 0;
205           return(1);
206   }
207   if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
208   int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
209   if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
210   {
211           cudaFree(fGPUMemory);
212           cudaThreadExit();
213           HLTError("Error allocating Page Locked Host Memory");
214           fgGPUUsed = 0;
215           return(1);
216   }
217   if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);
218
219   if (fDebugLevel >= 1)
220   {
221           CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
222   }
223
224   fSliceCount = sliceCount;
225   //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
226   fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);
227
228   for (int i = 0;i < fgkNSlices;i++)
229   {
230     fSlaveTrackers[i].SetGPUTracker();
231         fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
232         fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
233   }
234
235   fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
236   cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
237   for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
238   {
239         if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
240         {
241             cudaFree(fGPUMemory);
242             cudaFreeHost(fHostLockedMemory);
243             cudaThreadExit();
244             HLTError("Error creating CUDA Stream");
245             fgGPUUsed = 0;
246             return(1);
247         }
248   }
249
250   fCudaInitialized = 1;
251   HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %dd)", cudaDevice, fCudaDeviceProp.name, fThreadId);
252
253 #if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)
254   if (fDebugLevel < 2)
255   {
256           //Do one initial run for Benchmark reasons
257           const int useDebugLevel = fDebugLevel;
258           fDebugLevel = 0;
259           AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];
260
261           std::ifstream fin;
262
263           AliHLTTPCCAParam tmpParam;
264           AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;
265
266           fin.open("events/settings.dump");
267           int tmpCount;
268           fin >> tmpCount;
269           for (int i = 0;i < sliceCount;i++)
270           {
271                 fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);
272                 tmpParam.ReadSettings(fin);
273                 InitializeSliceParam(i, tmpParam);
274           }
275           fin.close();
276
277           fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);
278           for (int i = 0;i < sliceCount;i++)
279           {
280                 tmpCluster[i].StartReading(i, 0);
281                 tmpCluster[i].ReadEvent(fin);
282                 tmpCluster[i].FinishReading();
283           }
284           fin.close();
285
286           AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];
287           memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));
288
289           Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);
290           for (int i = 0;i < sliceCount;i++)
291           {
292                   free(tmpOutput[i]);
293                   tmpOutput[i] = NULL;
294                   fSlaveTrackers[i].SetOutputControl(NULL);
295           }
296           delete[] tmpOutput;
297           delete[] tmpCluster;
298           fDebugLevel = useDebugLevel;
299   }
300 #endif
301   return(0);
302 }
303
304 template <class T> inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int alignment)
305 {
306         //Macro to align Pointers.
307         //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker
308         //(As long as every single data structure is <= 1 MB)
309
310         size_t adr = (size_t) ptr;
311         if (adr % alignment)
312         {
313                 adr += alignment - (adr % alignment);
314         }
315         return((T*) adr);
316 }
317
318 bool AliHLTTPCCAGPUTracker::CudaFailedMsg(cudaError_t error)
319 {
320         //Check for CUDA Error and in the case of an error display the corresponding error string
321         if (error == cudaSuccess) return(false);
322         HLTWarning("CUDA Error: %d / %s", error, cudaGetErrorString(error));
323         return(true);
324 }
325
326 int AliHLTTPCCAGPUTracker::CUDASync(char* state)
327 {
328         //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards
329
330         if (fDebugLevel == 0) return(0);
331         cudaError cuErr;
332         cuErr = cudaGetLastError();
333         if (cuErr != cudaSuccess)
334         {
335                 HLTError("Cuda Error %s while invoking kernel (%s)", cudaGetErrorString(cuErr), state);
336                 return(1);
337         }
338         if (CudaFailedMsg(cudaThreadSynchronize()))
339         {
340                 HLTError("CUDA Error while synchronizing (%s)", state);
341                 return(1);
342         }
343         if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
344         return(0);
345 }
346
347 void AliHLTTPCCAGPUTracker::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
348 {
349         //Set Debug Level and Debug output File if applicable
350         fDebugLevel = dwLevel;
351         if (NewOutFile) fOutFile = NewOutFile;
352 }
353
354 int AliHLTTPCCAGPUTracker::SetGPUTrackerOption(char* OptionName, int /*OptionValue*/)
355 {
356         //Set a specific GPU Tracker Option
357         {
358                 HLTError("Unknown Option: %s", OptionName);
359                 return(1);
360         }
361         //No Options used at the moment
362         //return(0);
363 }
364
365 #ifdef HLTCA_STANDALONE
366 void AliHLTTPCCAGPUTracker::StandalonePerfTime(int iSlice, int i)
367 {
368   //Run Performance Query for timer i of slice iSlice
369   if (fDebugLevel >= 1)
370   {
371           AliHLTTPCCAStandaloneFramework::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));
372   }
373 }
374 #else
375 void AliHLTTPCCAGPUTracker::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
376 #endif
377
378 void AliHLTTPCCAGPUTracker::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
379 {
380         //Dump Rowblocks to File
381         if (fDebugLevel >= 4)
382         {
383                 *fOutFile << "RowBlock Tracklets" << std::endl;
384         
385                 int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);
386                 int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);
387                 uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT);
388                 CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
389                 CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
390                 CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT, cudaMemcpyDeviceToHost));
391                 CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
392
393                 int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
394                 for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
395                 {
396                         *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " << 
397                                 rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";
398                         for (int j = 0;j < rowBlockPos[i].x;j++)
399                         {
400                                 //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
401                                 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
402                                 if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)
403                                 {
404                                         HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);
405                                 }
406                                 k++;
407                                 if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)
408                                 {
409                                         HLTError("Error, -1 Tracklet found");
410                                 }
411                         }
412                         *fOutFile << std::endl << "Phase 2: ";
413                         for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
414                         {
415                                 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
416                         }
417                         *fOutFile << std::endl;
418                 }
419
420                 if (check)
421                 {
422                         *fOutFile << "Starting Threads: (First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;
423                         for (int i = 0;i < HLTCA_GPU_BLOCK_COUNT;i++)
424                         {
425                                 *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;
426                         }
427                 }
428
429                 free(rowBlockPos);
430                 free(rowBlockTracklets);
431                 free(blockStartingTracklet);
432         }
433 }
434
435 __global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)
436 {
437         //Initialize GPU RowBlocks and HitWeights
438         int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;
439         int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;
440         const int stride = blockDim.x * gridDim.x;
441         int4 i0, i1;
442         i0.x = i0.y = i0.z = i0.w = 0;
443         i1.x = i1.y = i1.z = i1.w = -1;
444         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)
445                 RowBlockPos[i] = i0;
446         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)
447                 rowBlockTracklets4[i] = i1;
448         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)
449                 sliceDataHitWeights4[i] = i0;
450 }
451
452 int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
453 {
454         //Primary reconstruction function
455         cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
456
457         if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;
458         
459         if (!fCudaInitialized)
460         {
461             HLTError("GPUTracker not initialized");
462             return(1);
463         }
464         if (sliceCountLocal > fSliceCount)
465         {
466             HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);
467             return(1);
468         }
469         if (fThreadId != (int) syscall (SYS_gettid))
470         {
471             HLTError("GPUTracker context was initialized by different thread, Initializing Thread: %d, Processing Thread: %d", fThreadId, (int) syscall (SYS_gettid));
472             return(1);
473         }
474
475         if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);
476
477         if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
478         {
479                 HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));
480                 return(1);
481         }
482
483         if (fDebugLevel >= 4)
484         {
485                 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
486                 {
487                         *fOutFile << std::endl << std::endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << std::endl;
488                 }
489         }
490
491         memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
492
493         if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
494
495 #ifdef HLTCA_GPU_TIME_PROFILE
496         unsigned __int64 a, b, c, d;
497         AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);
498         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);
499 #endif
500         
501         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
502         {
503                 //Make this a GPU Tracker
504                 fGpuTracker[iSlice].SetGPUTracker();
505                 fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
506                 fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
507                 fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
508
509                 //Set Pointers to GPU Memory
510                 char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
511
512                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
513                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
514                 tmpMem = alignPointer(tmpMem, 1024 * 1024);
515
516                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
517                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);
518                 tmpMem = alignPointer(tmpMem, 1024 * 1024);
519
520                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
521                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());
522                 tmpMem = alignPointer(tmpMem, 1024 * 1024);
523
524                 if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
525                 {
526                         HLTError("Insufficiant Track Memory");
527                         return(1);
528                 }
529
530                 if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
531                 {
532                         HLTError("Insufficiant Global Memory");
533                         return(1);
534                 }
535
536                 //Initialize Startup Constants
537                 *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
538                 *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
539                 *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
540                 fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = HLTCA_GPU_BLOCK_COUNT * (iSlice + 1) / sliceCountLocal - HLTCA_GPU_BLOCK_COUNT * (iSlice) / sliceCountLocal;
541                 if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);
542                 fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
543                 fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
544                 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
545                 fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
546         }
547
548 #ifdef HLTCA_GPU_TEXTURE_FETCH
549                 cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
550                 size_t offset;
551                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
552                 {
553                         HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
554                         return(1);
555                 }
556                 cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
557                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
558                 {
559                         HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
560                         return(1);
561                 }
562                 cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
563                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
564                 {
565                         HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
566                         return(1);
567                 }
568 #endif
569
570         //Copy Tracker Object to GPU Memory
571         if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
572 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
573         if (CudaFailedMsg(cudaMalloc(&fGpuTracker[0].fStageAtSync, 100000000))) return(1);
574         CudaFailedMsg(cudaMemset(fGpuTracker[0].fStageAtSync, 0, 100000000));
575 #endif
576         CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));
577
578         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
579         {
580                 StandalonePerfTime(firstSlice + iSlice, 0);
581
582                 //Initialize GPU Slave Tracker
583                 if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
584                 fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
585 #ifdef HLTCA_GPU_TIME_PROFILE
586                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
587 #endif
588                 fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
589 #ifdef HLTCA_GPU_TIME_PROFILE
590                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
591                 printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
592 #endif
593                 if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
594                 {
595                         HLTError("Insufficiant Slice Data Memory");
596                         return(1);
597                 }
598
599                 /*if (fSlaveTrackers[firstSlice + iSlice].CheckEmptySlice())
600                 {
601                         if (fDebugLevel >= 3) HLTInfo("Slice Empty, not running GPU Tracker");
602                         if (sliceCountLocal == 1)
603                                 return(0);
604                 }*/
605
606                 //Initialize temporary memory where needed
607                 if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");           
608                 PreInitRowBlocks<<<30, 256, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());
609
610                 //Copy Data to GPU Global Memory
611                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
612                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
613                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
614
615                 if (fDebugLevel >= 4)
616                 {
617                         if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
618                         fSlaveTrackers[firstSlice + iSlice].TrackletMemory() = reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] );
619                         fSlaveTrackers[firstSlice + iSlice].SetPointersTracklets( HLTCA_GPU_MAX_TRACKLETS );
620                         fSlaveTrackers[firstSlice + iSlice].HitMemory() = reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100] );
621                         fSlaveTrackers[firstSlice + iSlice].SetPointersHits( pClusterData[iSlice].NumberOfClusters() );
622                 }
623                 
624                 if (CUDASync("Initialization")) return(1);
625                 StandalonePerfTime(firstSlice + iSlice, 1);
626
627                 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");
628                 AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
629
630                 if (CUDASync("Neighbours finder")) return 1;
631
632                 StandalonePerfTime(firstSlice + iSlice, 2);
633
634                 if (fDebugLevel >= 4)
635                 {
636                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
637                         fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
638                 }
639
640                 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");
641                 AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
642                 if (CUDASync("Neighbours Cleaner")) return 1;
643
644                 StandalonePerfTime(firstSlice + iSlice, 3);
645
646                 if (fDebugLevel >= 4)
647                 {
648                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
649                         fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
650                 }
651
652                 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");
653                 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
654                 if (CUDASync("Start Hits Finder")) return 1;
655
656                 StandalonePerfTime(firstSlice + iSlice, 4);
657
658                 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter");
659                 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<30, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
660                 if (CUDASync("Start Hits Sorter")) return 1;
661
662                 StandalonePerfTime(firstSlice + iSlice, 5);
663
664                 if (fDebugLevel >= 2)
665                 {
666                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
667                         if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
668                         if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS)
669                         {
670                                 HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant");
671                                 return(1);
672                         }
673                 }
674
675                 if (fDebugLevel >= 4)
676                 {
677                         *fOutFile << "Temporary ";
678                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
679                         fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
680                         uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
681                         CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
682                         *fOutFile << "Start Hits Sort Vector:" << std::endl;
683                         for (int i = 0;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows();i++)
684                         {
685                                 *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;
686                         }
687                         free(tmpMemory);
688
689                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
690                         fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
691                 }
692
693                 StandalonePerfTime(firstSlice + iSlice, 6);
694                 
695                 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
696         }
697
698         StandalonePerfTime(firstSlice, 7);
699 #ifdef HLTCA_GPU_PREFETCHDATA
700         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
701         {
702                 if (fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v) > ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4))
703                 {
704                         HLTError("Insufficiant GPU shared Memory, required: %d, available %d", fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v), ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4));
705                         return(1);
706                 }
707                 if (fDebugLevel >= 1)
708                 {
709                         static int infoShown = 0;
710                         if (!infoShown)
711                         {
712                                 HLTInfo("GPU Shared Memory Cache Size: %d", 2 * fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v));
713                                 infoShown = 1;
714                         }
715                 }
716         }
717 #endif
718
719         int nHardCollisions = 0;
720
721 RestartTrackletConstructor:
722         if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
723         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
724         {
725                 AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);
726                 if (CUDASync("Tracklet Initializer")) return 1;
727                 DumpRowBlocks(fSlaveTrackers, iSlice);
728         }
729
730         if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
731         AliHLTTPCCATrackletConstructorNewGPU<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>();
732         if (CUDASync("Tracklet Constructor (new)")) return 1;
733         
734         StandalonePerfTime(firstSlice, 8);
735
736         if (fDebugLevel >= 4)
737         {
738                 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
739                 {
740                         DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);
741                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
742                         if (fDebugLevel >= 5)
743                         {
744                                 HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
745                         }
746                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
747                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
748                         fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);
749                 }
750         }
751
752         int runSlices = 0;
753         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)
754         {
755                 if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;
756                 if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);
757                 AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));
758         }
759         if (CUDASync("Tracklet Selector")) return 1;
760         StandalonePerfTime(firstSlice, 9);
761
762         CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + 0].CommonMemory(), fGpuTracker[0].CommonMemory(), fGpuTracker[0].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[0]));
763         for (int iSliceTmp = 0;iSliceTmp <= sliceCountLocal;iSliceTmp++)
764         {
765                 if (iSliceTmp < sliceCountLocal)
766                 {
767                         int iSlice = iSliceTmp;
768                         if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");
769                         cudaStreamSynchronize(cudaStreams[iSlice]);
770                         CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
771                         CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
772                         if (iSlice + 1 < sliceCountLocal)
773                                 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[iSlice + 1]));
774                 }
775
776                 if (iSliceTmp)
777                 {
778                         int iSlice = iSliceTmp - 1;
779                         cudaStreamSynchronize(cudaStreams[iSlice]);
780
781                         if (fDebugLevel >= 4)
782                         {
783                                 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
784                                 fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
785                                 fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);
786                         }
787
788                         if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
789                         {
790                                 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION && nHardCollisions++ < 10)
791                                 {
792                                         HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor");
793                                         for (int i = 0;i < sliceCountLocal;i++)
794                                         {
795                                                 cudaThreadSynchronize();
796                                                 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));
797                                                 *fSlaveTrackers[firstSlice + i].NTracks() = 0;
798                                                 *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;
799                                                 fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;
800                                                 CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));
801                                                 PreInitRowBlocks<<<30, 256>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());
802                                         }
803                                         goto RestartTrackletConstructor;
804                                 }
805                                 HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
806                                 return(1);
807                         }
808                         if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
809
810                         fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
811 #ifdef HLTCA_GPU_TIME_PROFILE
812                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
813 #endif
814                         fSlaveTrackers[firstSlice + iSlice].WriteOutput();
815 #ifdef HLTCA_GPU_TIME_PROFILE
816                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
817                         printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
818 #endif
819
820                         if (fDebugLevel >= 4)
821                         {
822                                 delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();
823                                 delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();
824                         }
825                 }
826         }
827
828         StandalonePerfTime(firstSlice, 10);
829
830         if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
831
832 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
833         char* stageAtSync = (char*) malloc(100000000);
834         CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].fStageAtSync, 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
835         cudaFree(fGpuTracker[0].fStageAtSync);
836
837         FILE* fp = fopen("profile.txt", "w+");
838         FILE* fp2 = fopen("profile.bmp", "w+b");
839         int nEmptySync = 0, fEmpty;
840
841         const int bmpheight = 1000;
842         BITMAPFILEHEADER bmpFH;
843         BITMAPINFOHEADER bmpIH;
844         ZeroMemory(&bmpFH, sizeof(bmpFH));
845         ZeroMemory(&bmpIH, sizeof(bmpIH));
846         
847         bmpFH.bfType = 19778; //"BM"
848         bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1) * bmpheight ;
849         bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);
850
851         bmpIH.biSize = sizeof(bmpIH);
852         bmpIH.biWidth = HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1;
853         bmpIH.biHeight = bmpheight;
854         bmpIH.biPlanes = 1;
855         bmpIH.biBitCount = 32;
856
857         fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
858         fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);  
859
860         for (int i = 0;i < bmpheight * HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;i += HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT)
861         {
862                 fEmpty = 1;
863                 for (int j = 0;j < HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;j++)
864                 {
865                         fprintf(fp, "%d\t", stageAtSync[i + j]);
866                         int color = 0;
867                         if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);
868                         if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);
869                         if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);
870                         if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);
871                         fwrite(&color, 1, sizeof(int), fp2);
872                         if (j > 0 && j % 32 == 0)
873                         {
874                                 color = RGB(255, 255, 255);
875                                 fwrite(&color, 1, 4, fp2);
876                         }
877                         if (stageAtSync[i + j]) fEmpty = 0;
878                 }
879                 fprintf(fp, "\n");
880                 if (fEmpty) nEmptySync++;
881                 else nEmptySync = 0;
882                 //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;
883         }
884
885         fclose(fp);
886         fclose(fp2);
887         free(stageAtSync);
888 #endif 
889
890         return(0);
891 }
892
893 int AliHLTTPCCAGPUTracker::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
894 {
895         //Initialize Slice Tracker Parameter for a slave tracker
896         fSlaveTrackers[iSlice].Initialize(param);
897         if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
898         {
899                 HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
900                 return(1);
901         }
902         return(0);
903 }
904
905 int AliHLTTPCCAGPUTracker::ExitGPU()
906 {
907         //Uninitialize CUDA
908         cudaThreadSynchronize();
909         if (fGPUMemory)
910         {
911                 cudaFree(fGPUMemory);
912                 fGPUMemory = NULL;
913         }
914         if (fHostLockedMemory)
915         {
916                 for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
917                 {
918                         cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
919                 }
920                 free(fpCudaStreams);
921                 fGpuTracker = NULL;
922                 cudaFreeHost(fHostLockedMemory);
923         }
924
925         if (CudaFailedMsg(cudaThreadExit()))
926         {
927                 HLTError("Could not uninitialize GPU");
928                 return(1);
929         }
930         HLTInfo("CUDA Uninitialized");
931         fgGPUUsed = false;
932         fCudaInitialized = 0;
933         return(0);
934 }
935
936 void AliHLTTPCCAGPUTracker::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
937 {
938         fOutputControl = val;
939         for (int i = 0;i < fgkNSlices;i++)
940         {
941                 fSlaveTrackers[i].SetOutputControl(val);
942         }
943 }
944
945 #endif