]> git.uio.no Git - u/mrichter/AliRoot.git/blob - HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu
7b2874c9b6fc9e4505c158537691e91897671e7d
[u/mrichter/AliRoot.git] / HLT / TPCLib / tracking-ca / cagpu / AliHLTTPCCAGPUTrackerNVCC.cu
1 // **************************************************************************
2 // This file is property of and copyright by the ALICE HLT Project          *
3 // ALICE Experiment at CERN, All rights reserved.                           *
4 //                                                                          *
5 // Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
6 //                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *
7 //                                      David Rohr <drohr@kip.uni-heidelberg.de>                                *
8 //                  for The ALICE HLT Project.                              *
9 //                                                                          *
10 // Permission to use, copy, modify and distribute this software and its     *
11 // documentation strictly for non-commercial purposes is hereby granted     *
12 // without fee, provided that the above copyright notice appears in all     *
13 // copies and that both the copyright notice and this permission notice     *
14 // appear in the supporting documentation. The authors make no claims       *
15 // about the suitability of this software for any purpose. It is            *
16 // provided "as is" without express or implied warranty.                    *
17 //                                                                          *
18 //***************************************************************************
19
20 #define HLTCA_GPU_DEFAULT_MAX_SLICE_COUNT 36
21 #define FERMI
22 #include "AliHLTTPCCAGPUTrackerNVCC.h"
23
24 #ifdef HLTCA_GPUCODE
25 #include <cuda.h>
26 #include <sm_11_atomic_functions.h>
27 #include <sm_12_atomic_functions.h>
28 #endif
29
30 #ifdef R__WIN32
31 #else
32 #include <sys/syscall.h>
33 #include <semaphore.h>
34 #include <fcntl.h>
35 #endif
36 #include "AliHLTTPCCADef.h"
37 #include "AliHLTTPCCAGPUConfig.h"
38
39 #if defined(HLTCA_STANDALONE) & !defined(_WIN32)
40 #include <sched.h>
41 #endif
42
43 #include <iostream>
44 #include <fstream>
45
46 //Disable assertions since they produce errors in GPU Code
47 #ifdef assert
48 #undef assert
49 #endif
50 #define assert(param)
51
52 __constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];
53 #ifdef HLTCA_GPU_TEXTURE_FETCH
54 texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;
55 texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;
56 texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
57 #endif
58
59 //Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code
60 #include "AliHLTTPCCATrackParam.cxx"
61 #include "AliHLTTPCCATrack.cxx" 
62
63 #include "AliHLTTPCCAHitArea.cxx"
64 #include "AliHLTTPCCAGrid.cxx"
65 #include "AliHLTTPCCARow.cxx"
66 #include "AliHLTTPCCAParam.cxx"
67 #include "AliHLTTPCCATracker.cxx"
68
69 #include "AliHLTTPCCAProcess.h"
70
71 #include "AliHLTTPCCATrackletSelector.cxx"
72 #include "AliHLTTPCCANeighboursFinder.cxx"
73 #include "AliHLTTPCCANeighboursCleaner.cxx"
74 #include "AliHLTTPCCAStartHitsFinder.cxx"
75 #include "AliHLTTPCCAStartHitsSorter.cxx"
76 #include "AliHLTTPCCATrackletConstructor.cxx"
77
78 #ifdef HLTCA_GPU_MERGER
79 #include "AliHLTTPCGMMerger.h"
80 #include "AliHLTTPCGMTrackParam.cxx"
81 #endif
82
83 #include "MemoryAssignmentHelpers.h"
84
85 #ifndef HLTCA_STANDALONE
86 #include "AliHLTDefinitions.h"
87 #include "AliHLTSystem.h"
88 #endif
89
90 #define RANDOM_ERROR
91 //#define RANDOM_ERROR || rand() % 500 == 1
92
93 ClassImp( AliHLTTPCCAGPUTrackerNVCC )
94
95 int AliHLTTPCCAGPUTrackerNVCC::GlobalTracking(int iSlice, int threadId, AliHLTTPCCAGPUTrackerNVCC::helperParam* hParam)
96 {
97         if (fDebugLevel >= 3) printf("GPU Tracker running Global Tracking for slice %d on thread %d\n", iSlice, threadId);
98
99         int sliceLeft = (iSlice + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2);
100         int sliceRight = (iSlice + 1) % (fgkNSlices / 2);
101         if (iSlice >= fgkNSlices / 2)
102         {
103                 sliceLeft += fgkNSlices / 2;
104                 sliceRight += fgkNSlices / 2;
105         }
106         while (fSliceOutputReady < iSlice || fSliceOutputReady < sliceLeft || fSliceOutputReady < sliceRight)
107         {
108                 if (hParam != NULL && hParam->fReset) return(1);
109         }
110
111         pthread_mutex_lock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceLeft]);
112         pthread_mutex_lock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceRight]);
113         fSlaveTrackers[iSlice].PerformGlobalTracking(fSlaveTrackers[sliceLeft], fSlaveTrackers[sliceRight], HLTCA_GPU_MAX_TRACKS);
114         pthread_mutex_unlock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceLeft]);
115         pthread_mutex_unlock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceRight]);
116
117         fSliceLeftGlobalReady[sliceLeft] = 1;
118         fSliceRightGlobalReady[sliceRight] = 1;
119         if (fDebugLevel >= 3) printf("GPU Tracker finished Global Tracking for slice %d on thread %d\n", iSlice, threadId);
120         return(0);
121 }
122
123 void* AliHLTTPCCAGPUTrackerNVCC::helperWrapper(void* arg)
124 {
125         AliHLTTPCCAGPUTrackerNVCC::helperParam* par = (AliHLTTPCCAGPUTrackerNVCC::helperParam*) arg;
126         AliHLTTPCCAGPUTrackerNVCC* cls = par->fCls;
127
128         AliHLTTPCCATracker* tmpTracker = new AliHLTTPCCATracker;
129
130 #ifdef HLTCA_STANDALONE
131         if (cls->fDebugLevel >= 2) HLTInfo("\tHelper thread %d starting", par->fNum);
132 #endif
133
134 #if defined(HLTCA_STANDALONE) & !defined(_WIN32)
135         cpu_set_t mask;
136         CPU_ZERO(&mask);
137         CPU_SET(par->fNum * 2 + 2, &mask);
138         //sched_setaffinity(0, sizeof(mask), &mask);
139 #endif
140
141         while(pthread_mutex_lock(&((pthread_mutex_t*) par->fMutex)[0]) == 0 && par->fTerminate == false)
142         {
143                 if (par->CPUTracker)
144                 {
145                         for (int i = 0;i < cls->fNSlicesPerCPUTracker;i++)
146                         {
147                                 int myISlice = cls->fSliceCount - cls->fNCPUTrackers * cls->fNSlicesPerCPUTracker + (par->fNum - cls->fNHelperThreads) * cls->fNSlicesPerCPUTracker + i;
148 #ifdef HLTCA_STANDALONE
149                                 if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Doing full CPU tracking, Slice %d", par->fNum, myISlice);
150 #endif
151                                 if (myISlice >= 0)
152                                 {
153                                         tmpTracker->Initialize(cls->fSlaveTrackers[par->fFirstSlice + myISlice].Param());
154                                         tmpTracker->ReadEvent(&par->pClusterData[myISlice]);
155                                         tmpTracker->DoTracking();
156                                         tmpTracker->SetOutput(&par->pOutput[myISlice]);
157                                         pthread_mutex_lock((pthread_mutex_t*) cls->fHelperMemMutex);
158                                         tmpTracker->WriteOutputPrepare();
159                                         pthread_mutex_unlock((pthread_mutex_t*) cls->fHelperMemMutex);
160                                         tmpTracker->WriteOutput();
161
162                                         /*cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUSliceDataMemory((char*) new uint4[HLTCA_GPU_SLICE_DATA_MEMORY/sizeof(uint4)], (char*) new uint4[HLTCA_GPU_ROWS_MEMORY/sizeof(uint4)]);
163                                         cls->fSlaveTrackers[par->fFirstSlice + myISlice].ReadEvent(&par->pClusterData[myISlice]);
164                                         cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersTracklets(HLTCA_GPU_MAX_TRACKLETS);
165                                         cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersHits(par->pClusterData[myISlice].NumberOfClusters());
166                                         cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersTracks(HLTCA_GPU_MAX_TRACKS, par->pClusterData[myISlice].NumberOfClusters());
167                                         cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS, cls->fConstructorBlockCount);
168                                         cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].HitMemorySize()/sizeof( uint4 ) + 100]), par->pClusterData[myISlice].NumberOfClusters());
169                                         cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerTracksMemory(reinterpret_cast<char*> ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackMemorySize()/sizeof( uint4 ) + 100]), HLTCA_GPU_MAX_TRACKS, par->pClusterData[myISlice].NumberOfClusters());
170                                         cls->fSlaveTrackers[par->fFirstSlice + myISlice].DoTracking();
171                                         cls->WriteOutput(par->pOutput, par->fFirstSlice, myISlice, par->fNum + 1);
172                                         delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].HitMemory();
173                                         delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackletMemory();
174                                         delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackMemory();*/
175                                 }
176 #ifdef HLTCA_STANDALONE
177                                 if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Finished, Slice %d", par->fNum, myISlice);
178 #endif
179                         }
180                 }
181                 else
182                 {
183                         int mustRunSlice19 = 0;
184                         for (int i = par->fNum + 1;i < par->fSliceCount;i += cls->fNHelperThreads + 1)
185                         {
186                                 //if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Running, Slice %d+%d, Phase %d", par->fNum, par->fFirstSlice, i, par->fPhase);
187                                 if (par->fPhase)
188                                 {
189                                         if (cls->fUseGlobalTracking)
190                                         {
191                                                 int realSlice = i + 1;
192                                                 if (realSlice % (fgkNSlices / 2) < 1) realSlice -= fgkNSlices / 2;
193
194                                                 if (realSlice % (fgkNSlices / 2) != 1)
195                                                 {
196                                                         cls->GlobalTracking(realSlice, par->fNum + 1, par);
197                                                 }
198
199                                                 if (realSlice == 19)
200                                                 {
201                                                         mustRunSlice19 = 1;
202                                                 }
203                                                 else
204                                                 {
205                                                         while (cls->fSliceLeftGlobalReady[realSlice] == 0 || cls->fSliceRightGlobalReady[realSlice] == 0)
206                                                         {
207                                                                 if (par->fReset) goto ResetHelperThread;
208                                                         }
209                                                         cls->WriteOutput(par->pOutput, par->fFirstSlice, realSlice, par->fNum + 1);
210                                                 }
211                                         }
212                                         else
213                                         {
214                                                 while (cls->fSliceOutputReady < i)
215                                                 {
216                                                         if (par->fReset) goto ResetHelperThread;
217                                                 }
218                                                 cls->WriteOutput(par->pOutput, par->fFirstSlice, i, par->fNum + 1);
219                                         }
220                                 }
221                                 else
222                                 {
223                                         cls->ReadEvent(par->pClusterData, par->fFirstSlice, i, par->fNum + 1);
224                                         par->fDone = i + 1;
225                                 }
226                                 //if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Finished, Slice %d+%d, Phase %d", par->fNum, par->fFirstSlice, i, par->fPhase);
227                         }
228                         if (mustRunSlice19)
229                         {
230                                 while (cls->fSliceLeftGlobalReady[19] == 0 || cls->fSliceRightGlobalReady[19] == 0)
231                                 {
232                                         if (par->fReset) goto ResetHelperThread;
233                                 }
234                                 cls->WriteOutput(par->pOutput, par->fFirstSlice, 19, par->fNum + 1);
235                         }
236                 }
237 ResetHelperThread:
238                 cls->ResetThisHelperThread(par);
239         }
240 #ifdef HLTCA_STANDALONE
241         if (cls->fDebugLevel >= 2) HLTInfo("\tHelper thread %d terminating", par->fNum);
242 #endif
243         delete tmpTracker;
244         pthread_mutex_unlock(&((pthread_mutex_t*) par->fMutex)[1]);
245         pthread_exit(NULL);
246         return(NULL);
247 }
248
249 void AliHLTTPCCAGPUTrackerNVCC::ResetThisHelperThread(AliHLTTPCCAGPUTrackerNVCC::helperParam* par)
250 {
251         if (par->fReset) HLTImportant("GPU Helper Thread %d reseting", par->fNum);
252         par->fReset = false;
253         pthread_mutex_unlock(&((pthread_mutex_t*) par->fMutex)[1]);
254 }
255
256 #define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem"
257
258 AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() :
259 fGpuTracker(NULL),
260 fGPUMemory(NULL),
261 fHostLockedMemory(NULL),
262 fGPUMergerMemory(NULL),
263 fGPUMergerHostMemory(NULL),
264 fGPUMergerMaxMemory(0),
265 fDebugLevel(0),
266 fDebugMask(0xFFFFFFFF),
267 fOutFile(NULL),
268 fGPUMemSize(0),
269 fpCudaStreams(NULL),
270 fSliceCount(HLTCA_GPU_DEFAULT_MAX_SLICE_COUNT),
271 fCudaDevice(0),
272 fOutputControl(NULL),
273 fThreadId(0),
274 fCudaInitialized(0),
275 fPPMode(0),
276 fSelfheal(0),
277 fConstructorBlockCount(30),
278 selectorBlockCount(30),
279 fCudaContext(NULL),
280 fNHelperThreads(HLTCA_GPU_DEFAULT_HELPER_THREADS),
281 fHelperParams(NULL),
282 fHelperMemMutex(NULL),
283 fSliceOutputReady(0),
284 fSliceGlobalMutexes(NULL),
285 fNCPUTrackers(0),
286 fNSlicesPerCPUTracker(0),
287 fGlobalTracking(0),
288 fUseGlobalTracking(0),
289 fNSlaveThreads(0)
290 {
291         fCudaContext = (void*) new CUcontext;
292 };
293
294 AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC()
295 {
296         delete (CUcontext*) fCudaContext;
297 };
298
299 void AliHLTTPCCAGPUTrackerNVCC::ReleaseGlobalLock(void* sem)
300 {
301         //Release the global named semaphore that locks GPU Initialization
302 #ifdef R__WIN32
303         HANDLE* h = (HANDLE*) sem;
304         ReleaseSemaphore(*h, 1, NULL);
305         CloseHandle(*h);
306         delete h;
307 #else
308         sem_t* pSem = (sem_t*) sem;
309         sem_post(pSem);
310         sem_unlink(SemLockName);
311 #endif
312 }
313
314 int AliHLTTPCCAGPUTrackerNVCC::CheckMemorySizes(int sliceCount)
315 {
316         //Check constants for correct memory sizes
317         if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
318         {
319                 HLTError("Insufficiant Tracker Object Memory for %d slices", sliceCount);
320                 return(1);
321         }
322
323         if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
324         {
325                 HLTError("Insufficiant Common Memory");
326                 return(1);
327         }
328
329         if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
330         {
331                 HLTError("Insufficiant Row Memory");
332                 return(1);
333         }
334
335         if (fDebugLevel >= 3)
336         {
337                 HLTInfo("Memory usage: Tracker Object %d / %d, Common Memory %d / %d, Row Memory %d / %d", (int) sizeof(AliHLTTPCCATracker) * sliceCount, HLTCA_GPU_TRACKER_OBJECT_MEMORY, (int) (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize()), HLTCA_GPU_COMMON_MEMORY, (int) (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow)), HLTCA_GPU_ROWS_MEMORY);
338         }
339         return(0);
340 }
341
342 int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)
343 {
344         //Find best CUDA device, initialize and allocate memory
345
346 #if defined(HLTCA_STANDALONE) & !defined(_WIN32)
347         cpu_set_t mask;
348         CPU_ZERO(&mask);
349         CPU_SET(0, &mask);
350         //sched_setaffinity(0, sizeof(mask), &mask);
351 #endif
352
353         if (sliceCount == -1) sliceCount = fSliceCount;
354
355         if (CheckMemorySizes(sliceCount)) return(1);
356
357 #ifdef R__WIN32
358         HANDLE* semLock = new HANDLE;
359         *semLock = CreateSemaphore(NULL, 1, 1, SemLockName);
360         if (*semLock == NULL)
361         {
362                 HLTError("Error creating GPUInit Semaphore");
363                 return(1);
364         }
365         WaitForSingleObject(*semLock, INFINITE);
366 #else
367         sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1);
368         if (semLock == SEM_FAILED)
369         {
370                 HLTError("Error creating GPUInit Semaphore");
371                 return(1);
372         }
373         timespec semtime;
374         clock_gettime(CLOCK_REALTIME, &semtime);
375         semtime.tv_sec += 10;
376         while (sem_timedwait(semLock, &semtime) != 0)
377         {
378                 HLTError("Global Lock for GPU initialisation was not released for 10 seconds, assuming another thread died");
379                 HLTWarning("Resetting the global lock");
380                 sem_post(semLock);
381         }
382 #endif
383
384         fThreadId = GetThread();
385
386         cudaDeviceProp fCudaDeviceProp;
387
388         fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
389
390 #ifdef HLTCA_GPU_MERGER
391         fGPUMergerMaxMemory = 2000000 * 5 * sizeof(float);
392         fGPUMemSize += fGPUMergerMaxMemory;
393 #endif
394
395 #ifndef CUDA_DEVICE_EMULATION
396         int count, bestDevice = -1;
397         long long int bestDeviceSpeed = 0, deviceSpeed;
398         if (CudaFailedMsg(cudaGetDeviceCount(&count)))
399         {
400                 HLTError("Error getting CUDA Device Count");
401                 ReleaseGlobalLock(semLock);
402                 return(1);
403         }
404         if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");
405 #ifdef FERMI
406         const int reqVerMaj = 2;
407         const int reqVerMin = 0;
408 #else
409         const int reqVerMaj = 1;
410         const int reqVerMin = 2;
411 #endif
412         for (int i = 0;i < count;i++)
413         {
414                 if (fDebugLevel >= 4) printf("Examining device %d\n", i);
415 #if CUDA_VERSION > 3010
416                 size_t free, total;
417 #else
418                 unsigned int free, total;
419 #endif
420                 cuInit(0);
421                 CUdevice tmpDevice;
422                 cuDeviceGet(&tmpDevice, i);
423                 CUcontext tmpContext;
424                 cuCtxCreate(&tmpContext, 0, tmpDevice);
425                 if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";
426                 cuCtxDestroy(tmpContext);
427                 if (fDebugLevel >= 4) printf("Obtained current memory usage for device %d\n", i);
428                 if (CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i))) continue;
429                 if (fDebugLevel >= 4) printf("Obtained device properties for device %d\n", i);
430                 int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < reqVerMaj || (fCudaDeviceProp.major == reqVerMaj && fCudaDeviceProp.minor < reqVerMin)) && free >= fGPUMemSize + 100 * 1024 + 1024;
431 #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
432                 //if (sliceCount > fCudaDeviceProp.multiProcessorCount * HLTCA_GPU_BLOCK_COUNT_CONSTRUCTOR_MULTIPLIER) deviceOK = 0;
433 #endif
434
435                 if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %lld / %lld)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, (long long int) free, (long long int) fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");
436                 deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free * (long long int) fCudaDeviceProp.major * (long long int) fCudaDeviceProp.major;
437                 if (deviceOK && deviceSpeed > bestDeviceSpeed)
438                 {
439                         bestDevice = i;
440                         bestDeviceSpeed = deviceSpeed;
441                 }
442         }
443         if (bestDevice == -1)
444         {
445                 HLTWarning("No %sCUDA Device available, aborting CUDA Initialisation", count ? "appropriate " : "");
446                 HLTInfo("Requiring Revision %d.%d, Mem: %lld, Multiprocessors: %d", reqVerMaj, reqVerMin, fGPUMemSize + 100 * 1024 * 1024, sliceCount);
447                 ReleaseGlobalLock(semLock);
448                 return(1);
449         }
450
451         if (forceDeviceID == -1)
452                 fCudaDevice = bestDevice;
453         else
454                 fCudaDevice = forceDeviceID;
455 #else
456         fCudaDevice = 0;
457 #endif
458
459         cudaGetDeviceProperties(&fCudaDeviceProp ,fCudaDevice ); 
460
461         if (fDebugLevel >= 1)
462         {
463                 HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);
464                 HLTInfo("totalGlobalMem = %lld", (unsigned long long int) fCudaDeviceProp.totalGlobalMem);
465                 HLTInfo("sharedMemPerBlock = %lld", (unsigned long long int) fCudaDeviceProp.sharedMemPerBlock);
466                 HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);
467                 HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);
468                 HLTInfo("memPitch = %lld", (unsigned long long int) fCudaDeviceProp.memPitch);
469                 HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);
470                 HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);
471                 HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);
472                 HLTInfo("totalConstMem = %lld", (unsigned long long int) fCudaDeviceProp.totalConstMem);
473                 HLTInfo("major = %d", fCudaDeviceProp.major);
474                 HLTInfo("minor = %d", fCudaDeviceProp.minor);
475                 HLTInfo("clockRate = %d", fCudaDeviceProp.clockRate);
476                 HLTInfo("memoryClockRate = %d", fCudaDeviceProp.memoryClockRate);
477                 HLTInfo("multiProcessorCount = %d", fCudaDeviceProp.multiProcessorCount);
478                 HLTInfo("textureAlignment = %lld", (unsigned long long int) fCudaDeviceProp.textureAlignment);
479         }
480         fConstructorBlockCount = fCudaDeviceProp.multiProcessorCount * HLTCA_GPU_BLOCK_COUNT_CONSTRUCTOR_MULTIPLIER;
481         selectorBlockCount = fCudaDeviceProp.multiProcessorCount * HLTCA_GPU_BLOCK_COUNT_SELECTOR_MULTIPLIER;
482
483         if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
484         {
485                 HLTError( "Unsupported CUDA Device" );
486                 ReleaseGlobalLock(semLock);
487                 return(1);
488         }
489
490         if (cuCtxCreate((CUcontext*) fCudaContext, CU_CTX_SCHED_AUTO, fCudaDevice) != CUDA_SUCCESS)
491         {
492                 HLTError("Could not set CUDA Device!");
493                 ReleaseGlobalLock(semLock);
494                 return(1);
495         }
496
497         if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
498         {
499                 HLTError("CUDA Memory Allocation Error");
500                 cudaThreadExit();
501                 ReleaseGlobalLock(semLock);
502                 return(1);
503         }
504         fGPUMergerMemory = ((char*) fGPUMemory) + fGPUMemSize - fGPUMergerMaxMemory;
505         ReleaseGlobalLock(semLock);
506         if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
507         int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
508 #ifdef HLTCA_GPU_MERGER
509         hostMemSize += fGPUMergerMaxMemory;
510 #endif
511         if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
512         {
513                 cudaFree(fGPUMemory);
514                 cudaThreadExit();
515                 HLTError("Error allocating Page Locked Host Memory");
516                 return(1);
517         }
518         if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);
519         fGPUMergerHostMemory = ((char*) fHostLockedMemory) + hostMemSize - fGPUMergerMaxMemory;
520
521         if (fDebugLevel >= 1)
522         {
523                 CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
524         }
525
526         fSliceCount = sliceCount;
527         //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
528         fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);
529
530         for (int i = 0;i < fgkNSlices;i++)
531         {
532                 fSlaveTrackers[i].SetGPUTracker();
533                 fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
534                 fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
535         }
536
537         fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
538         cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
539         for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
540         {
541                 if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
542                 {
543                         cudaFree(fGPUMemory);
544                         cudaFreeHost(fHostLockedMemory);
545                         cudaThreadExit();
546                         HLTError("Error creating CUDA Stream");
547                         return(1);
548                 }
549         }
550
551         if (StartHelperThreads()) return(1);
552
553         fHelperMemMutex = malloc(sizeof(pthread_mutex_t));
554         if (fHelperMemMutex == NULL)
555         {
556                 HLTError("Memory allocation error");
557                 cudaFree(fGPUMemory);
558                 cudaFreeHost(fHostLockedMemory);
559                 cudaThreadExit();
560                 return(1);
561         }
562
563         if (pthread_mutex_init((pthread_mutex_t*) fHelperMemMutex, NULL))
564         {
565                 HLTError("Error creating pthread mutex");
566                 cudaFree(fGPUMemory);
567                 cudaFreeHost(fHostLockedMemory);
568                 cudaThreadExit();
569                 return(1);
570         }
571
572         fSliceGlobalMutexes = malloc(sizeof(pthread_mutex_t) * fgkNSlices);
573         if (fSliceGlobalMutexes == NULL)
574         {
575                 HLTError("Memory allocation error");
576                 cudaFree(fGPUMemory);
577                 cudaFreeHost(fHostLockedMemory);
578                 cudaThreadExit();
579                 return(1);
580         }
581         for (int i = 0;i < fgkNSlices;i++)
582         {
583                 if (pthread_mutex_init(&((pthread_mutex_t*) fSliceGlobalMutexes)[i], NULL))
584                 {
585                         HLTError("Error creating pthread mutex");
586                         cudaFree(fGPUMemory);
587                         cudaFreeHost(fHostLockedMemory);
588                         cudaThreadExit();
589                         return(1);
590                 }
591         }
592
593         cuCtxPopCurrent((CUcontext*) fCudaContext);
594         fCudaInitialized = 1;
595         HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %d, Max slices: %d)", fCudaDevice, fCudaDeviceProp.name, fThreadId, fSliceCount);
596
597 #if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)
598         if (fDebugLevel < 2 && 0)
599         {
600                 //Do one initial run for Benchmark reasons
601                 const int useDebugLevel = fDebugLevel;
602                 fDebugLevel = 0;
603                 AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];
604
605                 std::ifstream fin;
606
607                 AliHLTTPCCAParam tmpParam;
608                 AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;
609
610                 fin.open("events/settings.dump");
611                 int tmpCount;
612                 fin >> tmpCount;
613                 for (int i = 0;i < sliceCount;i++)
614                 {
615                         fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);
616                         tmpParam.ReadSettings(fin);
617                         InitializeSliceParam(i, tmpParam);
618                 }
619                 fin.close();
620
621                 fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);
622                 for (int i = 0;i < sliceCount;i++)
623                 {
624                         tmpCluster[i].StartReading(i, 0);
625                         tmpCluster[i].ReadEvent(fin);
626                 }
627                 fin.close();
628
629                 AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];
630                 memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));
631
632                 Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);
633                 for (int i = 0;i < sliceCount;i++)
634                 {
635                         free(tmpOutput[i]);
636                         tmpOutput[i] = NULL;
637                         fSlaveTrackers[i].SetOutputControl(NULL);
638                 }
639                 delete[] tmpOutput;
640                 delete[] tmpCluster;
641                 fDebugLevel = useDebugLevel;
642         }
643 #endif
644
645         return(0);
646 }
647
648 int AliHLTTPCCAGPUTrackerNVCC::StartHelperThreads()
649 {
650         int nThreads = fNHelperThreads + fNCPUTrackers;
651         if (nThreads)
652         {
653                 fHelperParams = new helperParam[nThreads];
654                 if (fHelperParams == NULL)
655                 {
656                         HLTError("Memory allocation error");
657                         cudaFree(fGPUMemory);
658                         cudaFreeHost(fHostLockedMemory);
659                         cudaThreadExit();
660                         return(1);
661                 }       
662                 for (int i = 0;i < nThreads;i++)
663                 {
664                         fHelperParams[i].fCls = this;
665                         fHelperParams[i].fTerminate = false;
666                         fHelperParams[i].fReset = false;
667                         fHelperParams[i].fNum = i;
668                         fHelperParams[i].fMutex = malloc(2 * sizeof(pthread_mutex_t));
669                         if (fHelperParams[i].fMutex == NULL)
670                         {
671                                 HLTError("Memory allocation error");
672                                 cudaFree(fGPUMemory);
673                                 cudaFreeHost(fHostLockedMemory);
674                                 cudaThreadExit();
675                                 return(1);
676                         }
677                         for (int j = 0;j < 2;j++)
678                         {
679                                 if (pthread_mutex_init(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j], NULL))
680                                 {
681                                         HLTError("Error creating pthread mutex");
682                                         cudaFree(fGPUMemory);
683                                         cudaFreeHost(fHostLockedMemory);
684                                         cudaThreadExit();
685                                         return(1);
686                                 }
687
688                                 pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]);
689                         }
690                         fHelperParams[i].fThreadId = (void*) malloc(sizeof(pthread_t));
691
692                         if (pthread_create((pthread_t*) fHelperParams[i].fThreadId, NULL, helperWrapper, &fHelperParams[i]))
693                         {
694                                 HLTError("Error starting slave thread");
695                                 cudaFree(fGPUMemory);
696                                 cudaFreeHost(fHostLockedMemory);
697                                 cudaThreadExit();
698                         }
699                 }
700         }
701         fNSlaveThreads = nThreads;
702         return(0);
703 }
704
705 template <class T> inline T* AliHLTTPCCAGPUTrackerNVCC::alignPointer(T* ptr, int alignment)
706 {
707         //Macro to align Pointers.
708         //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker
709         //(As long as every single data structure is <= 1 MB)
710
711         size_t adr = (size_t) ptr;
712         if (adr % alignment)
713         {
714                 adr += alignment - (adr % alignment);
715         }
716         return((T*) adr);
717 }
718
719 bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsgA(cudaError_t error, const char* file, int line)
720 {
721         //Check for CUDA Error and in the case of an error display the corresponding error string
722         if (error == cudaSuccess) return(false);
723         HLTWarning("CUDA Error: %d / %s (%s:%d)", error, cudaGetErrorString(error), file, line);
724         return(true);
725 }
726
727 int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state, int sliceLocal, int slice)
728 {
729         //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards
730
731         if (fDebugLevel == 0) return(0);
732         cudaError cuErr;
733         cuErr = cudaGetLastError();
734         if (cuErr != cudaSuccess)
735         {
736                 HLTError("Cuda Error %s while running kernel (%s) (Slice %d; %d/%d)", cudaGetErrorString(cuErr), state, sliceLocal, slice, fgkNSlices);
737                 return(1);
738         }
739         if (CudaFailedMsg(cudaThreadSynchronize()))
740         {
741                 HLTError("CUDA Error while synchronizing (%s) (Slice %d; %d/%d)", state, sliceLocal, slice, fgkNSlices);
742                 return(1);
743         }
744         if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
745         return(0);
746 }
747
748 void AliHLTTPCCAGPUTrackerNVCC::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
749 {
750         //Set Debug Level and Debug output File if applicable
751         fDebugLevel = dwLevel;
752         if (NewOutFile) fOutFile = NewOutFile;
753 }
754
755 int AliHLTTPCCAGPUTrackerNVCC::SetGPUTrackerOption(char* OptionName, int OptionValue)
756 {
757         //Set a specific GPU Tracker Option
758         if (strcmp(OptionName, "PPMode") == 0)
759         {
760                 fPPMode = OptionValue;
761         }
762         else if (strcmp(OptionName, "DebugMask") == 0)
763         {
764                 fDebugMask = OptionValue;
765         }
766         else if (strcmp(OptionName, "HelperThreads") == 0)
767         {
768                 fNHelperThreads = OptionValue;
769         }
770         else if (strcmp(OptionName, "CPUTrackers") == 0)
771         {
772                 fNCPUTrackers = OptionValue;
773         }
774         else if (strcmp(OptionName, "SlicesPerCPUTracker") == 0)
775         {
776                 fNSlicesPerCPUTracker = OptionValue;
777         }
778         else if (strcmp(OptionName, "GlobalTracking") == 0)
779         {
780                 fGlobalTracking = OptionValue;
781         }
782         else
783         {
784                 HLTError("Unknown Option: %s", OptionName);
785                 return(1);
786         }
787
788         if (fNHelperThreads + fNCPUTrackers > fNSlaveThreads && fCudaInitialized)
789         {
790                 HLTInfo("Insufficient Slave Threads available (%d), creating additional Slave Threads (%d+%d)\n", fNSlaveThreads, fNHelperThreads, fNCPUTrackers);
791                 StopHelperThreads();
792                 StartHelperThreads();
793         }
794
795         return(0);
796 }
797
798 #ifdef HLTCA_STANDALONE
799 void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int iSlice, int i)
800 {
801         //Run Performance Query for timer i of slice iSlice
802         if (fDebugLevel >= 1)
803         {
804                 AliHLTTPCCATracker::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));
805         }
806 }
807 #else
808 void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
809 #endif
810
811 #if defined(BITWISE_COMPATIBLE_DEBUG_OUTPUT) || defined(HLTCA_GPU_ALTERNATIVE_SCHEDULER)
812 void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker*, int, bool) {}
813 #else
814 void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
815 {
816         //Dump Rowblocks to File
817         if (fDebugLevel >= 4)
818         {
819                 *fOutFile << "RowBlock Tracklets (Slice" << tracker[iSlice].Param().ISlice() << " (" << iSlice << " of reco))";
820                 *fOutFile << " after Tracklet Reconstruction";
821                 *fOutFile << std::endl;
822
823                 int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);
824                 int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);
825                 uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * fConstructorBlockCount);
826                 CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
827                 CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
828                 CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * fConstructorBlockCount, cudaMemcpyDeviceToHost));
829                 CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
830
831                 int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
832                 for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
833                 {
834                         *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " << 
835                                 rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";
836                         for (int j = 0;j < rowBlockPos[i].x;j++)
837                         {
838                                 //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
839                                 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
840 #ifdef HLTCA_GPU_SCHED_FIXED_START
841                                 if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)
842                                 {
843                                         HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);
844                                 }
845 #endif //HLTCA_GPU_SCHED_FIXED_START
846                                 k++;
847                                 if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)
848                                 {
849                                         HLTError("Error, -1 Tracklet found");
850                                 }
851                         }
852                         *fOutFile << std::endl << "Phase 2: ";
853                         for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
854                         {
855                                 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
856                         }
857                         *fOutFile << std::endl;
858                 }
859
860                 if (check)
861                 {
862                         *fOutFile << "Starting Threads: (Slice" << tracker[iSlice].Param().ISlice() << ", First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;
863                         for (int i = 0;i < fConstructorBlockCount;i++)
864                         {
865                                 *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;
866                         }
867                 }
868
869                 free(rowBlockPos);
870                 free(rowBlockTracklets);
871                 free(blockStartingTracklet);
872         }
873 }
874 #endif
875
876 __global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)
877 {
878         //Initialize GPU RowBlocks and HitWeights
879         int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;
880         const int stride = blockDim.x * gridDim.x;
881         int4 i0;
882         i0.x = i0.y = i0.z = i0.w = 0;
883 #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
884         int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;
885         int4 i1;
886         i1.x = i1.y = i1.z = i1.w = -1;
887         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)
888                 RowBlockPos[i] = i0;
889         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)
890                 rowBlockTracklets4[i] = i1;
891 #endif
892         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)
893                 sliceDataHitWeights4[i] = i0;
894 }
895
896 int AliHLTTPCCAGPUTrackerNVCC::SelfHealReconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
897 {
898         if (!fSelfheal)
899         {
900                 cuCtxPopCurrent((CUcontext*) fCudaContext);
901                 return(1);
902         }
903         static bool selfHealing = false;
904         if (selfHealing)
905         {
906                 HLTError("Selfhealing failed, giving up");
907                 cuCtxPopCurrent((CUcontext*) fCudaContext);
908                 return(1);
909         }
910         else
911         {
912                 HLTError("Unsolvable CUDA error occured, trying to reinitialize GPU");
913         }                       
914         selfHealing = true;
915         ExitGPU();
916         if (InitGPU(fSliceCount, fCudaDevice))
917         {
918                 HLTError("Could not reinitialize CUDA device, disabling GPU tracker");
919                 ExitGPU();
920                 return(1);
921         }
922         HLTInfo("GPU tracker successfully reinitialized, restarting tracking");
923         int retVal = Reconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal);
924         selfHealing = false;
925         return(retVal);
926 }
927
928 void AliHLTTPCCAGPUTrackerNVCC::ReadEvent(AliHLTTPCCAClusterData* pClusterData, int firstSlice, int iSlice, int threadId)
929 {
930         fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
931 #ifdef HLTCA_GPU_TIME_PROFILE
932         unsigned long long int a, b;
933         AliHLTTPCCATracker::StandaloneQueryTime(&a);
934 #endif
935         fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
936 #ifdef HLTCA_GPU_TIME_PROFILE
937         AliHLTTPCCATracker::StandaloneQueryTime(&b);
938         printf("Read %d %f %f\n", threadId, ((double) b - (double) a) / (double) fProfTimeC, ((double) a - (double) fProfTimeD) / (double) fProfTimeC);
939 #endif
940 }
941
942 void AliHLTTPCCAGPUTrackerNVCC::WriteOutput(AliHLTTPCCASliceOutput** pOutput, int firstSlice, int iSlice, int threadId)
943 {
944         if (fDebugLevel >= 3) printf("GPU Tracker running WriteOutput for slice %d on thread %d\n", firstSlice + iSlice, threadId);
945         fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
946 #ifdef HLTCA_GPU_TIME_PROFILE
947         unsigned long long int a, b;
948         AliHLTTPCCATracker::StandaloneQueryTime(&a);
949 #endif
950         if (fNHelperThreads) pthread_mutex_lock((pthread_mutex_t*) fHelperMemMutex);
951         fSlaveTrackers[firstSlice + iSlice].WriteOutputPrepare();
952         if (fNHelperThreads) pthread_mutex_unlock((pthread_mutex_t*) fHelperMemMutex);
953         fSlaveTrackers[firstSlice + iSlice].WriteOutput();
954 #ifdef HLTCA_GPU_TIME_PROFILE
955         AliHLTTPCCATracker::StandaloneQueryTime(&b);
956         printf("Write %d %f %f\n", threadId, ((double) b - (double) a) / (double) fProfTimeC, ((double) a - (double) fProfTimeD) / (double) fProfTimeC);
957 #endif
958         if (fDebugLevel >= 3) printf("GPU Tracker finished WriteOutput for slice %d on thread %d\n", firstSlice + iSlice, threadId);
959 }
960
961 int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
962 {
963         //Primary reconstruction function
964
965         cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
966
967         if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;
968
969         if (!fCudaInitialized)
970         {
971                 HLTError("GPUTracker not initialized");
972                 return(1);
973         }
974         if (sliceCountLocal > fSliceCount)
975         {
976                 HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);
977                 return(1);
978         }
979         if (fThreadId != GetThread())
980         {
981                 HLTWarning("CUDA thread changed, migrating context, Previous Thread: %d, New Thread: %d", fThreadId, GetThread());
982                 fThreadId = GetThread();
983         }
984
985         if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);
986
987         if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
988         {
989                 HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));
990                 return(1);
991         }
992         
993         cuCtxPushCurrent(*((CUcontext*) fCudaContext));
994         if (fPPMode)
995         {
996                 int retVal = ReconstructPP(pOutput, pClusterData, firstSlice, sliceCountLocal);
997                 cuCtxPopCurrent((CUcontext*) fCudaContext);
998                 return(retVal);
999         }
1000
1001         for (int i = fNHelperThreads;i < fNCPUTrackers + fNHelperThreads;i++)
1002         {
1003                 fHelperParams[i].CPUTracker = 1;
1004                 fHelperParams[i].pClusterData = pClusterData;
1005                 fHelperParams[i].pOutput = pOutput;
1006                 fHelperParams[i].fSliceCount = sliceCountLocal;
1007                 fHelperParams[i].fFirstSlice = firstSlice;
1008                 pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]);
1009         }
1010         sliceCountLocal -= fNCPUTrackers * fNSlicesPerCPUTracker;
1011         if (sliceCountLocal < 0) sliceCountLocal = 0;
1012
1013         fUseGlobalTracking = fGlobalTracking && sliceCountLocal == fgkNSlices;
1014
1015         memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
1016
1017         if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
1018
1019 #ifdef HLTCA_GPU_TIME_PROFILE
1020         AliHLTTPCCATracker::StandaloneQueryFreq(&fProfTimeC);
1021         AliHLTTPCCATracker::StandaloneQueryTime(&fProfTimeD);
1022 #endif
1023
1024         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1025         {
1026                 //Make this a GPU Tracker
1027                 fGpuTracker[iSlice].SetGPUTracker();
1028                 fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
1029                 fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
1030                 fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
1031
1032                 //Set Pointers to GPU Memory
1033                 char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
1034
1035                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
1036                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
1037                 tmpMem = alignPointer(tmpMem, 1024 * 1024);
1038
1039                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
1040                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount);
1041                 tmpMem = alignPointer(tmpMem, 1024 * 1024);
1042
1043                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
1044                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
1045                 tmpMem = alignPointer(tmpMem, 1024 * 1024);
1046
1047                 if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY RANDOM_ERROR)
1048                 {
1049                         HLTError("Insufficiant Track Memory");
1050                         cudaThreadSynchronize();
1051                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1052                         ResetHelperThreads(0);
1053                         return(1);
1054                 }
1055
1056                 if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY RANDOM_ERROR)
1057                 {
1058                         HLTError("Insufficiant Global Memory");
1059                         cudaThreadSynchronize();
1060                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1061                         ResetHelperThreads(0);
1062                         return(1);
1063                 }
1064
1065                 if (fDebugLevel >= 3)
1066                 {
1067                         HLTInfo("GPU Global Memory Used: %d/%d, Page Locked Tracks Memory used: %d / %d", (int) (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice)), HLTCA_GPU_GLOBAL_MEMORY, (int) fGpuTracker[iSlice].TrackMemorySize(), HLTCA_GPU_TRACKS_MEMORY);
1068                 }
1069
1070                 //Initialize Startup Constants
1071                 *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
1072                 *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
1073                 *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
1074                 fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = sliceCountLocal > fConstructorBlockCount ? (iSlice < fConstructorBlockCount) : fConstructorBlockCount * (iSlice + 1) / sliceCountLocal - fConstructorBlockCount * (iSlice) / sliceCountLocal;
1075                 if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);
1076                 fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
1077                 fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
1078                 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
1079                 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fNextTracklet = (fConstructorBlockCount / sliceCountLocal + (fConstructorBlockCount % sliceCountLocal > iSlice)) * HLTCA_GPU_THREAD_COUNT;
1080                 fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
1081         }
1082
1083 #ifdef HLTCA_GPU_TEXTURE_FETCH
1084         cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
1085         size_t offset;
1086         if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
1087         {
1088                 HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
1089                 cudaThreadSynchronize();
1090                 cuCtxPopCurrent((CUcontext*) fCudaContext);
1091                 ResetHelperThreads(0);
1092                 return(1);
1093         }
1094         cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
1095         if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
1096         {
1097                 HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
1098                 cudaThreadSynchronize();
1099                 cuCtxPopCurrent((CUcontext*) fCudaContext);
1100                 ResetHelperThreads(0);
1101                 return(1);
1102         }
1103         cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
1104         if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR)
1105         {
1106                 HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
1107                 cudaThreadSynchronize();
1108                 cuCtxPopCurrent((CUcontext*) fCudaContext);
1109                 ResetHelperThreads(0);
1110                 return(1);
1111         }
1112 #endif
1113
1114         //Copy Tracker Object to GPU Memory
1115         if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
1116 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
1117         char* tmpMem;
1118         if (CudaFailedMsg(cudaMalloc(&tmpMem, 100000000)))
1119         {
1120                 HLTError("Error allocating CUDA profile memory");
1121                 cudaThreadSynchronize();
1122                 cuCtxPopCurrent((CUcontext*) fCudaContext);
1123                 ResetHelperThreads(0);
1124                 return(1);
1125         }
1126         fGpuTracker[0].fStageAtSync = tmpMem;
1127         CudaFailedMsg(cudaMemset(fGpuTracker[0].StageAtSync(), 0, 100000000));
1128 #endif
1129         CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));
1130         if (CUDASync("Initialization (1)", 0, firstSlice) RANDOM_ERROR)
1131         {
1132                 cudaThreadSynchronize();
1133                 cuCtxPopCurrent((CUcontext*) fCudaContext);
1134                 ResetHelperThreads(0);
1135                 return(1);
1136         }
1137
1138         for (int i = 0;i < fNHelperThreads;i++)
1139         {
1140                 fHelperParams[i].CPUTracker = 0;
1141                 fHelperParams[i].fDone = 0;
1142                 fHelperParams[i].fPhase = 0;
1143                 fHelperParams[i].pClusterData = pClusterData;
1144                 fHelperParams[i].fSliceCount = sliceCountLocal;
1145                 fHelperParams[i].fFirstSlice = firstSlice;
1146                 pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]);
1147         }
1148
1149         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1150         {
1151                 StandalonePerfTime(firstSlice + iSlice, 0);
1152
1153                 //Initialize GPU Slave Tracker
1154                 if (fDebugLevel >= 3) HLTInfo("Creating Slice Data (Slice %d)", iSlice);
1155                 if (iSlice % (fNHelperThreads + 1) == 0)
1156                 {
1157                         ReadEvent(pClusterData, firstSlice, iSlice, 0);
1158                 }
1159                 else
1160                 {
1161                         if (fDebugLevel >= 3) HLTInfo("Waiting for helper thread %d", iSlice % (fNHelperThreads + 1) - 1);
1162                         while(fHelperParams[iSlice % (fNHelperThreads + 1) - 1].fDone < iSlice);
1163                 }
1164
1165                 if (fDebugLevel >= 4)
1166                 {
1167 #ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
1168                         *fOutFile << std::endl << std::endl << "Reconstruction: " << iSlice << "/" << sliceCountLocal << " Total Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << " / " << fgkNSlices << std::endl;
1169 #endif
1170                         if (fDebugMask & 1) fSlaveTrackers[firstSlice + iSlice].DumpSliceData(*fOutFile);
1171                 }
1172
1173                 if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY RANDOM_ERROR)
1174                 {
1175                         HLTError("Insufficiant Slice Data Memory");
1176                         cudaThreadSynchronize();
1177                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1178                         ResetHelperThreads(1);
1179                         return(1);
1180                 }
1181
1182                 if (fDebugLevel >= 3)
1183                 {
1184                         HLTInfo("GPU Slice Data Memory Used: %d/%d", (int) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize(), HLTCA_GPU_SLICE_DATA_MEMORY);
1185                 }
1186
1187                 //Initialize temporary memory where needed
1188                 if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");           
1189                 PreInitRowBlocks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());
1190                 if (CUDASync("Initialization (2)", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1191                 {
1192                         cudaThreadSynchronize();
1193                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1194                         ResetHelperThreads(1);
1195                         return(1);
1196                 }
1197
1198                 //Copy Data to GPU Global Memory
1199                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
1200                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
1201                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
1202
1203                 if (fDebugLevel >= 4)
1204                 {
1205                         if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
1206                         fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount);
1207                         fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() );
1208                 }
1209
1210                 if (CUDASync("Initialization (3)", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1211                 {
1212                         cudaThreadSynchronize();
1213                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1214                         ResetHelperThreads(1);
1215                         return(1);
1216                 }
1217                 StandalonePerfTime(firstSlice + iSlice, 1);
1218
1219                 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder (Slice %d/%d)", iSlice, sliceCountLocal);
1220                 AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), HLTCA_GPU_THREAD_COUNT_FINDER, 0, cudaStreams[iSlice & 1]>>>(iSlice);
1221
1222                 if (CUDASync("Neighbours finder", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1223                 {
1224                         cudaThreadSynchronize();
1225                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1226                         ResetHelperThreads(1);
1227                         return(1);
1228                 }
1229
1230                 StandalonePerfTime(firstSlice + iSlice, 2);
1231
1232                 if (fDebugLevel >= 4)
1233                 {
1234                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
1235                         if (fDebugMask & 2) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
1236                 }
1237
1238                 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner (Slice %d/%d)", iSlice, sliceCountLocal);
1239                 AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice);
1240                 if (CUDASync("Neighbours Cleaner", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1241                 {
1242                         cudaThreadSynchronize();
1243                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1244                         ResetHelperThreads(1);
1245                         return(1);
1246                 }
1247
1248                 StandalonePerfTime(firstSlice + iSlice, 3);
1249
1250                 if (fDebugLevel >= 4)
1251                 {
1252                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
1253                         if (fDebugMask & 4) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
1254                 }
1255
1256                 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder (Slice %d/%d)", iSlice, sliceCountLocal);
1257                 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice);
1258                 if (CUDASync("Start Hits Finder", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1259                 {
1260                         cudaThreadSynchronize();
1261                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1262                         ResetHelperThreads(1);
1263                         return(1);
1264                 }
1265
1266                 StandalonePerfTime(firstSlice + iSlice, 4);
1267
1268                 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter (Slice %d/%d)", iSlice, sliceCountLocal);
1269                 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice & 1]>>>(iSlice);
1270                 if (CUDASync("Start Hits Sorter", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1271                 {
1272                         cudaThreadSynchronize();
1273                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1274                         ResetHelperThreads(1);
1275                         return(1);
1276                 }
1277
1278                 StandalonePerfTime(firstSlice + iSlice, 5);
1279
1280                 if (fDebugLevel >= 2)
1281                 {
1282                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
1283                         if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d (Slice %d)", *fSlaveTrackers[firstSlice + iSlice].NTracklets(), iSlice);
1284                         if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS RANDOM_ERROR)
1285                         {
1286                                 HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant");
1287                                 cudaThreadSynchronize();
1288                                 cuCtxPopCurrent((CUcontext*) fCudaContext);
1289                                 ResetHelperThreads(1);
1290                                 return(1);
1291                         }
1292                 }
1293
1294                 if (fDebugLevel >= 4 && *fSlaveTrackers[firstSlice + iSlice].NTracklets())
1295                 {
1296 #ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
1297                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
1298                         if (fDebugMask & 8)
1299                         {
1300                                 *fOutFile << "Temporary ";
1301                                 fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
1302                         }
1303                         uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
1304                         CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
1305                         if (fDebugMask & 16)
1306                         {
1307                                 *fOutFile << "Start Hits Sort Vector:" << std::endl;
1308                                 for (int i = 1;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows() - 5;i++)
1309                                 {
1310                                         *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;
1311                                 }
1312                         }
1313                         free(tmpMemory);
1314 #endif
1315
1316                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
1317                         if (fDebugMask & 32) fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
1318                 }
1319
1320                 StandalonePerfTime(firstSlice + iSlice, 6);
1321
1322                 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
1323         }
1324
1325         for (int i = 0;i < fNHelperThreads;i++)
1326         {
1327                 pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]);
1328         }
1329
1330         StandalonePerfTime(firstSlice, 7);
1331
1332 #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
1333         int nHardCollisions = 0;
1334
1335 RestartTrackletConstructor:
1336         if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
1337         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1338         {
1339                 AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);
1340                 if (CUDASync("Tracklet Initializer", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1341                 {
1342                         cudaThreadSynchronize();
1343                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1344                         return(1);
1345                 }
1346                 if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice);
1347         }
1348 #endif
1349
1350         if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
1351         AliHLTTPCCATrackletConstructorGPU<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR>>>();
1352         if (CUDASync("Tracklet Constructor", 0, firstSlice) RANDOM_ERROR)
1353         {
1354                 cudaThreadSynchronize();
1355                 cuCtxPopCurrent((CUcontext*) fCudaContext);
1356                 return(1);
1357         }
1358
1359         StandalonePerfTime(firstSlice, 8);
1360
1361         if (fDebugLevel >= 4)
1362         {
1363                 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1364                 {
1365                         if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);
1366                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
1367                         if (fDebugLevel >= 5)
1368                         {
1369                                 HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
1370                         }
1371                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
1372                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
1373                         if (0 && fSlaveTrackers[firstSlice + iSlice].NTracklets() && fSlaveTrackers[firstSlice + iSlice].Tracklet(0).NHits() < 0)
1374                         {
1375                                 cudaThreadSynchronize();
1376                                 cuCtxPopCurrent((CUcontext*) fCudaContext);
1377                                 printf("INTERNAL ERROR\n");
1378                                 return(1);
1379                         }
1380                         if (fDebugMask & 128) fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);
1381                 }
1382         }
1383
1384         int runSlices = 0;
1385         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)
1386         {
1387                 if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;
1388                 if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);
1389                 AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<selectorBlockCount, HLTCA_GPU_THREAD_COUNT_SELECTOR, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));
1390                 if (CUDASync("Tracklet Selector", iSlice, iSlice + firstSlice) RANDOM_ERROR)
1391                 {
1392                         cudaThreadSynchronize();
1393                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1394                         return(1);
1395                 }
1396         }
1397         StandalonePerfTime(firstSlice, 9);
1398
1399         char *tmpMemoryGlobalTracking = NULL;
1400         fSliceOutputReady = 0;
1401         if (fUseGlobalTracking)
1402         {
1403                 int tmpmemSize = sizeof(AliHLTTPCCATracklet)
1404 #ifdef EXTERN_ROW_HITS
1405                 + HLTCA_ROW_COUNT * sizeof(int)
1406 #endif
1407                 + 16;
1408                 tmpMemoryGlobalTracking = (char*) malloc(tmpmemSize * fgkNSlices);
1409                 for (int i = 0;i < fgkNSlices;i++)
1410                 {
1411                         fSliceLeftGlobalReady[i] = 0;
1412                         fSliceRightGlobalReady[i] = 0;
1413                 }
1414                 memset(fGlobalTrackingDone, 0, fgkNSlices);
1415                 memset(fWriteOutputDone, 0, fgkNSlices);
1416
1417                 for (int iSlice = 0;iSlice < fgkNSlices;iSlice++)
1418                 {
1419                         fSlaveTrackers[iSlice].SetGPUTrackerTrackletsMemory(tmpMemoryGlobalTracking + (tmpmemSize * iSlice), 1, fConstructorBlockCount);
1420                 }
1421         }
1422         for (int i = 0;i < fNHelperThreads;i++)
1423         {
1424                 fHelperParams[i].fPhase = 1;
1425                 fHelperParams[i].pOutput = pOutput;
1426                 pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]);
1427         }
1428
1429         int tmpSlice = 0, tmpSlice2 = 0;
1430         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1431         {
1432                 if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");
1433
1434                 while(tmpSlice < sliceCountLocal && (tmpSlice == iSlice || cudaStreamQuery(cudaStreams[tmpSlice]) == (cudaError_t) CUDA_SUCCESS))
1435                 {
1436                         if (CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice])) RANDOM_ERROR)
1437                         {
1438                                 ResetHelperThreads(1);
1439                                 cudaThreadSynchronize();
1440                                 return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal));
1441                         }
1442                         tmpSlice++;
1443                 }
1444
1445                 while (tmpSlice2 < tmpSlice && (tmpSlice2 == iSlice ? cudaStreamSynchronize(cudaStreams[tmpSlice2]) : cudaStreamQuery(cudaStreams[tmpSlice2])) == (cudaError_t) CUDA_SUCCESS)
1446                 {
1447                         CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].Tracks(), fGpuTracker[tmpSlice2].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + tmpSlice2].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));
1448                         CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].TrackHits(), fGpuTracker[tmpSlice2].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + tmpSlice2].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));
1449                         tmpSlice2++;
1450                 }
1451
1452                 if (CudaFailedMsg(cudaStreamSynchronize(cudaStreams[iSlice])) RANDOM_ERROR)
1453                 {
1454                         ResetHelperThreads(1);
1455                         cudaThreadSynchronize();
1456                         return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal));
1457                 }
1458
1459                 if (fDebugLevel >= 4)
1460                 {
1461                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
1462 #ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT
1463                         if (fDebugMask & 256) fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
1464 #endif
1465                         if (fDebugMask & 512) fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);
1466                 }
1467
1468                 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError RANDOM_ERROR)
1469                 {
1470 #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
1471                         if ((fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION || fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_WRONG_ROW)&& nHardCollisions++ < 10)
1472                         {
1473                                 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION)
1474                                 {
1475                                         HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor (Slice %d)", firstSlice + iSlice);
1476                                 }
1477                                 else
1478                                 {
1479                                         HLTWarning("Tracklet Constructor returned invalid row (Slice %d)", firstSlice + iSlice);
1480                                 }
1481                                 if (fDebugLevel >= 4)
1482                                 {
1483                                         ResetHelperThreads(1);
1484                                         cudaThreadSynchronize();
1485                                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1486                                         return(1);
1487                                 }
1488                                 for (int i = 0;i < sliceCountLocal;i++)
1489                                 {
1490                                         cudaThreadSynchronize();
1491                                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));
1492                                         *fSlaveTrackers[firstSlice + i].NTracks() = 0;
1493                                         *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;
1494                                         fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;
1495                                         CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));
1496                                         PreInitRowBlocks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());
1497                                 }
1498                                 goto RestartTrackletConstructor;
1499                         }
1500 #endif
1501                         HLTError("GPU Tracker returned Error Code %d in slice %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError, firstSlice + iSlice);
1502                         cudaThreadSynchronize();
1503                         cuCtxPopCurrent((CUcontext*) fCudaContext);
1504                         ResetHelperThreads(1);
1505                         return(1);
1506                 }
1507                 if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
1508
1509                 fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTracks = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracks;
1510                 fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTrackHits = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTrackHits;
1511                 if (fUseGlobalTracking) fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracklets = 1;
1512
1513                 if (fDebugLevel >= 3) HLTInfo("Data ready for slice %d, helper thread %d", iSlice, iSlice % (fNHelperThreads + 1));
1514                 fSliceOutputReady = iSlice;
1515
1516                 if (fUseGlobalTracking)
1517                 {
1518                         if (iSlice % (fgkNSlices / 2) == 2)
1519                         {
1520                                 int tmpId = iSlice % (fgkNSlices / 2) - 1;
1521                                 if (iSlice >= fgkNSlices / 2) tmpId += fgkNSlices / 2;
1522                                 GlobalTracking(tmpId, 0, NULL);
1523                                 fGlobalTrackingDone[tmpId] = 1;
1524                         }
1525                         for (int tmpSlice3a = 0;tmpSlice3a < iSlice;tmpSlice3a += fNHelperThreads + 1)
1526                         {
1527                                 int tmpSlice3 = tmpSlice3a + 1;
1528                                 if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2);
1529                                 if (tmpSlice3 >= iSlice) break;
1530
1531                                 int sliceLeft = (tmpSlice3 + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2);
1532                                 int sliceRight = (tmpSlice3 + 1) % (fgkNSlices / 2);
1533                                 if (tmpSlice3 >= fgkNSlices / 2)
1534                                 {
1535                                         sliceLeft += fgkNSlices / 2;
1536                                         sliceRight += fgkNSlices / 2;
1537                                 }
1538
1539                                 if (tmpSlice3 % (fgkNSlices / 2) != 1 && fGlobalTrackingDone[tmpSlice3] == 0 && sliceLeft < iSlice && sliceRight < iSlice)
1540                                 {
1541                                         GlobalTracking(tmpSlice3, 0, NULL);
1542                                         fGlobalTrackingDone[tmpSlice3] = 1;
1543                                 }
1544
1545                                 if (fWriteOutputDone[tmpSlice3] == 0 && fSliceLeftGlobalReady[tmpSlice3] && fSliceRightGlobalReady[tmpSlice3])
1546                                 {
1547                                         WriteOutput(pOutput, firstSlice, tmpSlice3, 0);
1548                                         fWriteOutputDone[tmpSlice3] = 1;
1549                                 }
1550                         }
1551                 }
1552                 else
1553                 {
1554                         if (iSlice % (fNHelperThreads + 1) == 0)
1555                         {
1556                                 WriteOutput(pOutput, firstSlice, iSlice, 0);
1557                         }
1558                 }
1559
1560                 if (fDebugLevel >= 4)
1561                 {
1562                         delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();
1563                         delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();
1564                 }
1565         }
1566
1567         if (fUseGlobalTracking)
1568         {
1569                 for (int tmpSlice3a = 0;tmpSlice3a < fgkNSlices;tmpSlice3a += fNHelperThreads + 1)
1570                 {
1571                         int tmpSlice3 = (tmpSlice3a + 1);
1572                         if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2);
1573                         if (fGlobalTrackingDone[tmpSlice3] == 0) GlobalTracking(tmpSlice3, 0, NULL);
1574                 }
1575                 for (int tmpSlice3a = 0;tmpSlice3a < fgkNSlices;tmpSlice3a += fNHelperThreads + 1)
1576                 {
1577                         int tmpSlice3 = (tmpSlice3a + 1);
1578                         if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2);
1579                         if (fWriteOutputDone[tmpSlice3] == 0)
1580                         {
1581                                 while (fSliceLeftGlobalReady[tmpSlice3] == 0 || fSliceRightGlobalReady[tmpSlice3] == 0);
1582                                 WriteOutput(pOutput, firstSlice, tmpSlice3, 0);
1583                         }
1584                 }
1585         }
1586
1587         for (int i = 0;i < fNHelperThreads + fNCPUTrackers;i++)
1588         {
1589                 pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]);
1590         }
1591
1592         if (fUseGlobalTracking)
1593         {
1594                 free(tmpMemoryGlobalTracking);
1595                 if (fDebugLevel >= 3)
1596                 {
1597                         for (int iSlice = 0;iSlice < fgkNSlices;iSlice++)
1598                         {
1599                                 printf("Slice %d - Tracks: Local %d Global %d - Hits: Local %d Global %d\n", iSlice, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTracks, fSlaveTrackers[iSlice].CommonMemory()->fNTracks, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTrackHits, fSlaveTrackers[iSlice].CommonMemory()->fNTrackHits);
1600                         }
1601                 }
1602         }
1603
1604         StandalonePerfTime(firstSlice, 10);
1605
1606         if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
1607
1608         /*for (int i = firstSlice;i < firstSlice + sliceCountLocal;i++)
1609         {
1610                 fSlaveTrackers[i].DumpOutput(stdout);
1611         }*/
1612
1613         /*static int runnum = 0;
1614         std::ofstream tmpOut;
1615         char buffer[1024];
1616         sprintf(buffer, "GPUtracks%d.out", runnum++);
1617         tmpOut.open(buffer);
1618         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1619         {
1620                 fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(tmpOut);
1621         }
1622         tmpOut.close();*/
1623
1624 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
1625         char* stageAtSync = (char*) malloc(100000000);
1626         CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].StageAtSync(), 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
1627         cudaFree(fGpuTracker[0].StageAtSync());
1628
1629         FILE* fp = fopen("profile.txt", "w+");
1630         FILE* fp2 = fopen("profile.bmp", "w+b");
1631         int nEmptySync = 0, fEmpty;
1632
1633         const int bmpheight = 8192;
1634         BITMAPFILEHEADER bmpFH;
1635         BITMAPINFOHEADER bmpIH;
1636         ZeroMemory(&bmpFH, sizeof(bmpFH));
1637         ZeroMemory(&bmpIH, sizeof(bmpIH));
1638
1639         bmpFH.bfType = 19778; //"BM"
1640         bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR / 32 * 33 - 1) * bmpheight ;
1641         bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);
1642
1643         bmpIH.biSize = sizeof(bmpIH);
1644         bmpIH.biWidth = fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR / 32 * 33 - 1;
1645         bmpIH.biHeight = bmpheight;
1646         bmpIH.biPlanes = 1;
1647         bmpIH.biBitCount = 32;
1648
1649         fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
1650         fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);  
1651
1652         for (int i = 0;i < bmpheight * fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;i += fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR)
1653         {
1654                 fEmpty = 1;
1655                 for (int j = 0;j < fConstructorBlockCount * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;j++)
1656                 {
1657                         fprintf(fp, "%d\t", stageAtSync[i + j]);
1658                         int color = 0;
1659                         if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);
1660                         if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);
1661                         if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);
1662                         if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);
1663                         fwrite(&color, 1, sizeof(int), fp2);
1664                         if (j > 0 && j % 32 == 0)
1665                         {
1666                                 color = RGB(255, 255, 255);
1667                                 fwrite(&color, 1, 4, fp2);
1668                         }
1669                         if (stageAtSync[i + j]) fEmpty = 0;
1670                 }
1671                 fprintf(fp, "\n");
1672                 if (fEmpty) nEmptySync++;
1673                 else nEmptySync = 0;
1674                 //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;
1675         }
1676
1677         fclose(fp);
1678         fclose(fp2);
1679         free(stageAtSync);
1680 #endif 
1681
1682         cuCtxPopCurrent((CUcontext*) fCudaContext);
1683         return(0);
1684 }
1685
1686 __global__ void ClearPPHitWeights(int sliceCount)
1687 {
1688         //Clear HitWeights
1689
1690         for (int k = 0;k < sliceCount;k++)
1691         {
1692                 AliHLTTPCCATracker &tracker = ((AliHLTTPCCATracker*) gAliHLTTPCCATracker)[k];
1693                 int4* const pHitWeights = (int4*) tracker.Data().HitWeights();
1694                 const int dwCount = tracker.Data().NumberOfHitsPlusAlign();
1695                 const int stride = blockDim.x * gridDim.x;
1696                 int4 i0;
1697                 i0.x = i0.y = i0.z = i0.w = 0;
1698
1699                 for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < dwCount * sizeof(int) / sizeof(int4);i += stride)
1700                 {
1701                         pHitWeights[i] = i0;
1702                 }
1703         }
1704 }
1705
1706 int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
1707 {
1708         //Primary reconstruction function for small events (PP)
1709
1710         memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
1711
1712         if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
1713
1714         char* tmpSliceMemHost = (char*) SliceDataMemory(fHostLockedMemory, 0);
1715         char* tmpSliceMemGpu = (char*) SliceDataMemory(fGPUMemory, 0);
1716
1717         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1718         {
1719                 StandalonePerfTime(firstSlice + iSlice, 0);
1720
1721                 //Initialize GPU Slave Tracker
1722                 if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
1723                 fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(tmpSliceMemHost, RowMemory(fHostLockedMemory, firstSlice + iSlice));
1724                 fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
1725                 if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
1726                 {
1727                         HLTError("Insufficiant Slice Data Memory");
1728                         return(1);
1729                 }
1730
1731                 //Make this a GPU Tracker
1732                 fGpuTracker[iSlice].SetGPUTracker();
1733                 fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
1734
1735
1736                 fGpuTracker[iSlice].SetGPUSliceDataMemory(tmpSliceMemGpu, RowMemory(fGPUMemory, iSlice));
1737                 fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
1738
1739                 tmpSliceMemHost += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();
1740                 tmpSliceMemHost = alignPointer(tmpSliceMemHost, 64 * 1024);
1741                 tmpSliceMemGpu += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();
1742                 tmpSliceMemGpu = alignPointer(tmpSliceMemGpu, 64 * 1024);
1743
1744
1745                 //Set Pointers to GPU Memory
1746                 char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
1747
1748                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
1749                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
1750                 tmpMem = alignPointer(tmpMem, 64 * 1024);
1751
1752                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
1753                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount);
1754                 tmpMem = alignPointer(tmpMem, 64 * 1024);
1755
1756                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
1757                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
1758                 tmpMem = alignPointer(tmpMem, 64 * 1024);
1759
1760                 if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
1761                 {
1762                         HLTError("Insufficiant Track Memory");
1763                         return(1);
1764                 }
1765
1766                 if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
1767                 {
1768                         HLTError("Insufficiant Global Memory");
1769                         return(1);
1770                 }
1771
1772                 //Initialize Startup Constants
1773                 *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
1774                 *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
1775                 *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
1776                 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
1777
1778                 fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
1779
1780                 if (CUDASync("Initialization", iSlice, iSlice + firstSlice)) return(1);
1781                 StandalonePerfTime(firstSlice + iSlice, 1);
1782         }
1783
1784 #ifdef HLTCA_GPU_TEXTURE_FETCH
1785         cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
1786         size_t offset;
1787         if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
1788         {
1789                 HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
1790                 return(1);
1791         }
1792         cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
1793         if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
1794         {
1795                 HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
1796                 return(1);
1797         }
1798         cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
1799         if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
1800         {
1801                 HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
1802                 return(1);
1803         }
1804 #endif
1805
1806         //Copy Tracker Object to GPU Memory
1807         if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
1808         CudaFailedMsg(cudaMemcpyToSymbol(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice));
1809
1810         //Copy Data to GPU Global Memory
1811         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1812         {
1813                 CudaFailedMsg(cudaMemcpy(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice));
1814                 //printf("%lld %lld %d %d\n", (size_t) (char*) fGpuTracker[iSlice].Data().Memory(), (size_t) (char*) fSlaveTrackers[firstSlice + iSlice].Data().Memory(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize());
1815         }
1816         //CudaFailedMsg(cudaMemcpy(SliceDataMemory(fGPUMemory, 0), SliceDataMemory(fHostLockedMemory, 0), tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0), cudaMemcpyHostToDevice));
1817         //printf("%lld %lld %d\n", (size_t) (char*) SliceDataMemory(fGPUMemory, 0), (size_t) (char*) SliceDataMemory(fHostLockedMemory, 0), (int) (size_t) (tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0)));
1818         CudaFailedMsg(cudaMemcpy(fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyHostToDevice));
1819         CudaFailedMsg(cudaMemcpy(fGpuTracker[0].SliceDataRows(), fSlaveTrackers[firstSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) * sliceCountLocal, cudaMemcpyHostToDevice));
1820
1821         if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");
1822         AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursFinder> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_FINDER>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows());
1823         if (CUDASync("Neighbours finder", 0, firstSlice)) return 1;
1824         StandalonePerfTime(firstSlice, 2);
1825         if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");
1826         AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursCleaner> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 2);
1827         if (CUDASync("Neighbours Cleaner", 0, firstSlice)) return 1;
1828         StandalonePerfTime(firstSlice, 3);
1829         if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");
1830         AliHLTTPCCAProcessMultiA<AliHLTTPCCAStartHitsFinder> <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 6);
1831         if (CUDASync("Start Hits Finder", 0, firstSlice)) return 1;
1832         StandalonePerfTime(firstSlice, 4);
1833
1834         ClearPPHitWeights <<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(sliceCountLocal);
1835         if (CUDASync("Clear Hit Weights", 0, firstSlice)) return 1;
1836
1837         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1838         {
1839                 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
1840         }
1841
1842         StandalonePerfTime(firstSlice, 7);
1843
1844         if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
1845         AliHLTTPCCATrackletConstructorGPUPP<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR>>>(0, sliceCountLocal);
1846         if (CUDASync("Tracklet Constructor PP", 0, firstSlice)) return 1;
1847
1848         StandalonePerfTime(firstSlice, 8);
1849
1850         AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<selectorBlockCount, HLTCA_GPU_THREAD_COUNT_SELECTOR>>>(0, sliceCountLocal);
1851         if (CUDASync("Tracklet Selector", 0, firstSlice)) return 1;
1852         StandalonePerfTime(firstSlice, 9);
1853
1854         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice].CommonMemory(), fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyDeviceToHost));
1855
1856         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1857         {
1858                 if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");
1859
1860                 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost));
1861                 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost));
1862
1863                 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
1864                 {
1865                         HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
1866                         return(1);
1867                 }
1868                 if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
1869                 fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTracks = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracks;
1870                 fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTrackHits = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTrackHits;
1871         }
1872
1873         if (fGlobalTracking && sliceCountLocal == fgkNSlices)
1874         {
1875                 char tmpMemory[sizeof(AliHLTTPCCATracklet)
1876 #ifdef EXTERN_ROW_HITS
1877                 + HLTCA_ROW_COUNT * sizeof(int)
1878 #endif
1879                 + 16];
1880
1881                 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1882                 {
1883                         if (fSlaveTrackers[iSlice].CommonMemory()->fNTracklets)
1884                         {
1885                                 HLTError("Slave tracker tracklets found where none expected, memory not freed!\n");
1886                         }
1887                         fSlaveTrackers[iSlice].SetGPUTrackerTrackletsMemory(&tmpMemory[0], 1, fConstructorBlockCount);
1888                         fSlaveTrackers[iSlice].CommonMemory()->fNTracklets = 1;
1889                 }
1890
1891                 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1892                 {
1893                         int sliceLeft = (iSlice + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2);
1894                         int sliceRight = (iSlice + 1) % (fgkNSlices / 2);
1895                         if (iSlice >= fgkNSlices / 2)
1896                         {
1897                                 sliceLeft += fgkNSlices / 2;
1898                                 sliceRight += fgkNSlices / 2;
1899                         }
1900                         fSlaveTrackers[iSlice].PerformGlobalTracking(fSlaveTrackers[sliceLeft], fSlaveTrackers[sliceRight], HLTCA_GPU_MAX_TRACKS);              
1901                 }
1902
1903                 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1904                 {
1905                         printf("Slice %d - Tracks: Local %d Global %d - Hits: Local %d Global %d\n", iSlice, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTracks, fSlaveTrackers[iSlice].CommonMemory()->fNTracks, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTrackHits, fSlaveTrackers[iSlice].CommonMemory()->fNTrackHits);
1906                 }
1907         }
1908
1909         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
1910         {
1911                 fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
1912                 fSlaveTrackers[firstSlice + iSlice].WriteOutputPrepare();
1913                 fSlaveTrackers[firstSlice + iSlice].WriteOutput();
1914         }
1915
1916         StandalonePerfTime(firstSlice, 10);
1917
1918         if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
1919
1920         return(0);
1921 }
1922
1923 int AliHLTTPCCAGPUTrackerNVCC::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
1924 {
1925         //Initialize Slice Tracker Parameter for a slave tracker
1926         fSlaveTrackers[iSlice].Initialize(param);
1927         if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
1928         {
1929                 HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
1930                 return(1);
1931         }
1932         return(0);
1933 }
1934
1935 int AliHLTTPCCAGPUTrackerNVCC::ExitGPU()
1936 {
1937         //Uninitialize CUDA
1938         cuCtxPushCurrent(*((CUcontext*) fCudaContext));
1939
1940         cudaThreadSynchronize();
1941         if (fGPUMemory)
1942         {
1943                 cudaFree(fGPUMemory);
1944                 fGPUMemory = NULL;
1945         }
1946         if (fHostLockedMemory)
1947         {
1948                 for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
1949                 {
1950                         cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
1951                 }
1952                 free(fpCudaStreams);
1953                 fGpuTracker = NULL;
1954                 cudaFreeHost(fHostLockedMemory);
1955         }
1956
1957         if (CudaFailedMsg(cudaThreadExit()))
1958         {
1959                 HLTError("Could not uninitialize GPU");
1960                 return(1);
1961         }
1962
1963         if (StopHelperThreads()) return(1);
1964         pthread_mutex_destroy((pthread_mutex_t*) fHelperMemMutex);
1965         free(fHelperMemMutex);
1966
1967         for (int i = 0;i < fgkNSlices;i++) pthread_mutex_destroy(&((pthread_mutex_t*) fSliceGlobalMutexes)[i]);
1968         free(fSliceGlobalMutexes);
1969
1970         cuCtxDestroy(*((CUcontext*) fCudaContext));
1971
1972         cudaDeviceReset();
1973
1974         HLTInfo("CUDA Uninitialized");
1975         fCudaInitialized = 0;
1976         return(0);
1977 }
1978
1979 void AliHLTTPCCAGPUTrackerNVCC::ResetHelperThreads(int helpers)
1980 {
1981         HLTImportant("Error occurred, GPU tracker helper threads will be reset (Number of threads %d/%d)", fNHelperThreads, fNCPUTrackers);
1982         for (int i = 0;i < fNHelperThreads + fNCPUTrackers;i++)
1983         {
1984                 fHelperParams[i].fReset = true;
1985                 if (helpers || i >= fNHelperThreads) pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]);
1986         }
1987         HLTImportant("GPU Tracker helper threads have ben reset");
1988 }
1989
1990 int AliHLTTPCCAGPUTrackerNVCC::StopHelperThreads()
1991 {
1992         if (fNSlaveThreads)
1993         {
1994                 for (int i = 0;i < fNSlaveThreads;i++)
1995                 {
1996                         fHelperParams[i].fTerminate = true;
1997                         if (pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]))
1998                         {
1999                                 HLTError("Error unlocking mutex to terminate slave");
2000                                 return(1);
2001                         }
2002                         if (pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]))
2003                         {
2004                                 HLTError("Error locking mutex");
2005                                 return(1);
2006                         }
2007                         if (pthread_join( *((pthread_t*) fHelperParams[i].fThreadId), NULL))
2008                         {
2009                                 HLTError("Error waiting for thread to terminate");
2010                                 return(1);
2011                         }
2012                         free(fHelperParams[i].fThreadId);
2013                         for (int j = 0;j < 2;j++)
2014                         {
2015                                 if (pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]))
2016                                 {
2017                                         HLTError("Error unlocking mutex before destroying");
2018                                         return(1);
2019                                 }
2020                                 pthread_mutex_destroy(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]);
2021                         }
2022                         free(fHelperParams[i].fMutex);
2023                 }
2024                 delete[] fHelperParams;
2025         }
2026         fNSlaveThreads = 0;
2027         return(0);
2028 }
2029
2030 void AliHLTTPCCAGPUTrackerNVCC::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
2031 {
2032         //Set Output Control Pointers
2033         fOutputControl = val;
2034         for (int i = 0;i < fgkNSlices;i++)
2035         {
2036                 fSlaveTrackers[i].SetOutputControl(val);
2037         }
2038 }
2039
2040 int AliHLTTPCCAGPUTrackerNVCC::GetThread()
2041 {
2042         //Get Thread ID
2043 #ifdef R__WIN32
2044         return((int) (size_t) GetCurrentThread());
2045 #else
2046         return((int) syscall (SYS_gettid));
2047 #endif
2048 }
2049
2050 unsigned long long int* AliHLTTPCCAGPUTrackerNVCC::PerfTimer(int iSlice, unsigned int i)
2051 {
2052         //Returns pointer to PerfTimer i of slice iSlice
2053         return(fSlaveTrackers ? fSlaveTrackers[iSlice].PerfTimer(i) : NULL);
2054 }
2055
2056 const AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerNVCC::OutputControl() const
2057 {
2058         //Return Pointer to Output Control Structure
2059         return fOutputControl;
2060 }
2061
2062 int AliHLTTPCCAGPUTrackerNVCC::GetSliceCount() const
2063 {
2064         //Return max slice count processable
2065         return(fSliceCount);
2066 }
2067
2068 char* AliHLTTPCCAGPUTrackerNVCC::MergerBaseMemory()
2069 {
2070         return(alignPointer((char*) fGPUMergerHostMemory, 1024 * 1024));
2071 }
2072
2073 int AliHLTTPCCAGPUTrackerNVCC::RefitMergedTracks(AliHLTTPCGMMerger* Merger)
2074 {
2075 #ifndef HLTCA_GPU_MERGER
2076         HLTError("HLTCA_GPU_MERGER compile flag not set");
2077         return(1);
2078 #else
2079         if (!fCudaInitialized)
2080         {
2081                 HLTError("GPU Merger not initialized");
2082                 return(1);
2083         }
2084         unsigned long long int a, b, c, d, e;
2085         AliHLTTPCCATracker::StandaloneQueryFreq(&e);
2086
2087         char* gpumem = (char*) fGPUMergerMemory;
2088         float *X, *Y, *Z, *Angle;
2089         unsigned int *RowType;
2090         AliHLTTPCGMMergedTrack* tracks;
2091         float* field;
2092         AliHLTTPCCAParam* param;
2093
2094         gpumem = alignPointer(gpumem, 1024 * 1024);
2095
2096         AssignMemory(X, gpumem, Merger->NClusters());
2097         AssignMemory(Y, gpumem, Merger->NClusters());
2098         AssignMemory(Z, gpumem, Merger->NClusters());
2099         AssignMemory(Angle, gpumem, Merger->NClusters());
2100         AssignMemory(RowType, gpumem, Merger->NClusters());
2101         AssignMemory(tracks, gpumem, Merger->NOutputTracks());
2102         AssignMemory(field, gpumem, 6);
2103         AssignMemory(param, gpumem, 1);
2104
2105
2106         if ((size_t) (gpumem - (char*) fGPUMergerMemory) > (size_t) fGPUMergerMaxMemory)
2107         {
2108                 HLTError("Insufficiant GPU Merger Memory");
2109         }
2110
2111         cuCtxPushCurrent(*((CUcontext*) fCudaContext));
2112
2113         if (fDebugLevel >= 2) HLTInfo("Running GPU Merger (%d/%d)", Merger->NOutputTrackClusters(), Merger->NClusters());
2114         AliHLTTPCCATracker::StandaloneQueryTime(&a);
2115         CudaFailedMsg(cudaMemcpy(X, Merger->ClusterX(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
2116         CudaFailedMsg(cudaMemcpy(Y, Merger->ClusterY(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
2117         CudaFailedMsg(cudaMemcpy(Z, Merger->ClusterZ(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
2118         CudaFailedMsg(cudaMemcpy(Angle, Merger->ClusterAngle(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice));
2119         CudaFailedMsg(cudaMemcpy(RowType, Merger->ClusterRowType(), Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyHostToDevice));
2120         CudaFailedMsg(cudaMemcpy(tracks, Merger->OutputTracks(), Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyHostToDevice));
2121         CudaFailedMsg(cudaMemcpy(field, Merger->PolinomialFieldBz(), 6 * sizeof(float), cudaMemcpyHostToDevice));
2122         CudaFailedMsg(cudaMemcpy(param, fSlaveTrackers[0].pParam(), sizeof(AliHLTTPCCAParam), cudaMemcpyHostToDevice));
2123         AliHLTTPCCATracker::StandaloneQueryTime(&b);
2124         RefitTracks<<<fConstructorBlockCount, HLTCA_GPU_THREAD_COUNT>>>(tracks, Merger->NOutputTracks(), field, X, Y, Z, RowType, Angle, param);
2125         CudaFailedMsg(cudaThreadSynchronize());
2126         AliHLTTPCCATracker::StandaloneQueryTime(&c);
2127         CudaFailedMsg(cudaMemcpy(Merger->ClusterX(), X, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
2128         CudaFailedMsg(cudaMemcpy(Merger->ClusterY(), Y, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
2129         CudaFailedMsg(cudaMemcpy(Merger->ClusterZ(), Z, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
2130         CudaFailedMsg(cudaMemcpy(Merger->ClusterAngle(), Angle, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost));
2131         CudaFailedMsg(cudaMemcpy(Merger->ClusterRowType(), RowType, Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyDeviceToHost));
2132         CudaFailedMsg(cudaMemcpy((void*) Merger->OutputTracks(), tracks, Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyDeviceToHost));
2133         CudaFailedMsg(cudaThreadSynchronize());
2134         AliHLTTPCCATracker::StandaloneQueryTime(&d);
2135         if (fDebugLevel >= 2) HLTInfo("GPU Merger Finished");
2136
2137         if (fDebugLevel > 0)
2138         {
2139                 int copysize = 4 * Merger->NOutputTrackClusters() * sizeof(float) + Merger->NOutputTrackClusters() * sizeof(unsigned int) + Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack) + 6 * sizeof(float) + sizeof(AliHLTTPCCAParam);
2140                 double speed = (double) copysize * (double) e / (double) (b - a) / 1e9;
2141                 printf("GPU Fit:\tCopy To:\t%lld us (%lf GB/s)\n", (b - a) * 1000000 / e, speed);
2142                 printf("\t\tFit:\t%lld us\n", (c - b) * 1000000 / e);
2143                 speed = (double) copysize * (double) e / (double) (d - c) / 1e9;
2144                 printf("\t\tCopy From:\t%lld us (%lf GB/s)\n", (d - c) * 1000000 / e, speed);
2145         }
2146
2147         cuCtxPopCurrent((CUcontext*) fCudaContext);
2148         return(0);
2149 #endif
2150 }
2151
2152 int AliHLTTPCCAGPUTrackerNVCC::IsInitialized()
2153 {
2154         return(fCudaInitialized);
2155 }
2156
2157 AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()
2158 {
2159         return new AliHLTTPCCAGPUTrackerNVCC;
2160 }
2161
2162 void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr)
2163 {
2164         delete ptr;
2165 }
2166