1 // **************************************************************************
2 // This file is property of and copyright by the ALICE HLT Project          *
3 // ALICE Experiment at CERN, All rights reserved.                           *
4 //                                                                          *
5 // Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
6 //                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *
7 //                  David Rohr <drohr@kip.uni-heidelberg.de>                *
8 //                  for The ALICE HLT Project.                              *
9 //                                                                          *
10 // Permission to use, copy, modify and distribute this software and its     *
11 // documentation strictly for non-commercial purposes is hereby granted     *
12 // without fee, provided that the above copyright notice appears in all     *
13 // copies and that both the copyright notice and this permission notice     *
14 // appear in the supporting documentation. The authors make no claims       *
15 // about the suitability of this software for any purpose. It is            *
16 // provided "as is" without express or implied warranty.                    *
17 //                                                                          *
18 //***************************************************************************
19
20 #include "AliHLTTPCCAGPUTrackerNVCC.h"
21
22 #ifdef HLTCA_GPUCODE
23 #include <cuda.h>
24 #include <sm_11_atomic_functions.h>
25 #include <sm_12_atomic_functions.h>
26 #endif
27
28 #ifdef R__WIN32
29 #else
30 #include <sys/syscall.h>
31 #include <semaphore.h>
32 #include <fcntl.h>
33 #endif
34
35 #include "AliHLTTPCCADef.h"
36 #include "AliHLTTPCCAGPUConfig.h"
37
38
39 #include <iostream>
40
41 //Disable assertions since they produce errors in GPU Code
42 #ifdef assert
43 #undef assert
44 #endif
45 #define assert(param)
46
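//Constant memory buffer holding the per-slice AliHLTTPCCATracker objects;
//it is filled via cudaMemcpyToSymbolAsync at the start of Reconstruct().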
47 __constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];
48 #ifdef HLTCA_GPU_TEXTURE_FETCH
49 texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;
50 texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;
51 texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
52 #endif
53
54 //Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code
55 #include "AliHLTTPCCATrackParam.cxx"
56 #include "AliHLTTPCCATrack.cxx" 
57
58 #include "AliHLTTPCCAHitArea.cxx"
59 #include "AliHLTTPCCAGrid.cxx"
60 #include "AliHLTTPCCARow.cxx"
61 #include "AliHLTTPCCAParam.cxx"
62 #include "AliHLTTPCCATracker.cxx"
63
64 #include "AliHLTTPCCAProcess.h"
65
66 #include "AliHLTTPCCATrackletSelector.cxx"
67 #include "AliHLTTPCCANeighboursFinder.cxx"
68 #include "AliHLTTPCCANeighboursCleaner.cxx"
69 #include "AliHLTTPCCAStartHitsFinder.cxx"
70 #include "AliHLTTPCCAStartHitsSorter.cxx"
71 #include "AliHLTTPCCATrackletConstructor.cxx"
72 #include "AliHLTTPCCASliceOutput.cxx"
73
74 #include "MemoryAssignmentHelpers.h"
75
76 #ifndef HLTCA_STANDALONE
77 #include "AliHLTDefinitions.h"
78 #include "AliHLTSystem.h"
79 #endif
80
81 ClassImp( AliHLTTPCCAGPUTrackerNVCC )
82
83 bool AliHLTTPCCAGPUTrackerNVCC::fgGPUUsed = false;
84
85 #define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem"
86
87 AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() :
88         fGpuTracker(NULL),
89         fGPUMemory(NULL),
90         fHostLockedMemory(NULL),
91         fDebugLevel(0),
92         fOutFile(NULL),
93         fGPUMemSize(0),
94         fpCudaStreams(NULL),
95         fSliceCount(0),
96         fOutputControl(NULL),
97         fThreadId(0),
98         fCudaInitialized(0)
99         {}
100
101 AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC() {}
102
103 void AliHLTTPCCAGPUTrackerNVCC::ReleaseGlobalLock(void* sem)
104 {
105         //Release the global named semaphore that locks GPU Initialization
106 #ifdef R__WIN32
107         HANDLE* h = (HANDLE*) sem;
108         ReleaseSemaphore(*h, 1, NULL);
109         CloseHandle(*h);
110         delete h;
111 #else
112         sem_t* pSem = (sem_t*) sem;
113         sem_post(pSem);
114         sem_unlink(SemLockName);
115 #endif
116 }
117
118 int AliHLTTPCCAGPUTrackerNVCC::CheckMemorySizes(int sliceCount)
119 {
120         //Check constants for correct memory sizes
121   if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
122   {
123           HLTError("Insufficient Tracker Object Memory");
124           return(1);
125   }
126
127   if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
128   {
129           HLTError("Insufficient Common Memory");
130           return(1);
131   }
132
133   if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
134   {
135           HLTError("Insufficient Row Memory");
136           return(1);
137   }
138   return(0);
139 }
140
141 int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)
142 {
143         //Find best CUDA device, initialize and allocate memory
144
145         if (CheckMemorySizes(sliceCount)) return(1);
146
147 #ifdef R__WIN32
148         HANDLE* semLock = new HANDLE;
149         *semLock = CreateSemaphore(NULL, 1, 1, SemLockName);
150         if (*semLock == NULL)
151         {
152                 HLTError("Error creating GPUInit Semaphore");
153                 return(1);
154         }
155         WaitForSingleObject(*semLock, INFINITE);
156 #else
157         sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1);
158         if (semLock == SEM_FAILED)
159         {
160                 HLTError("Error creating GPUInit Semaphore");
161                 return(1);
162         }
163         sem_wait(semLock);
164 #endif
165
166         if (fgGPUUsed)
167         {
168             HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in the same process");
169                 ReleaseGlobalLock(semLock);
170             return(1);
171         }
172         fgGPUUsed = 1;
173         fThreadId = GetThread();
174
175         cudaDeviceProp fCudaDeviceProp;
176
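        //Total GPU memory budget: row memory + common memory + per-slice slice-data and global (hit/tracklet/track) memory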
177         fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
178
179 #ifndef CUDA_DEVICE_EMULATION
180         int count, bestDevice = -1;
181         long long int bestDeviceSpeed = 0, deviceSpeed;
182         if (CudaFailedMsg(cudaGetDeviceCount(&count)))
183         {
184                 HLTError("Error getting CUDA Device Count");
185                 fgGPUUsed = 0;
186                 ReleaseGlobalLock(semLock);
187                 return(1);
188         }
189         if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");
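        //Rank devices by a simple speed estimate (multiprocessor count * clock rate * warp size * free memory)
        //and pick the fastest one with compute capability >= 1.2 and enough free memory for fGPUMemSize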
190         for (int i = 0;i < count;i++)
191         {
192                 unsigned int free, total;
193                 cuInit(0);
194                 CUdevice tmpDevice;
195                 cuDeviceGet(&tmpDevice, i);
196                 CUcontext tmpContext;
197                 cuCtxCreate(&tmpContext, 0, tmpDevice);
198                 if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";
199                 cuCtxDestroy(tmpContext);
200                 CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i));
201
202                 int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && free >= fGPUMemSize;
203
204                 if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %d / %lld)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, free, (long long int) fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");
205                 deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free;
206                 if (deviceOK && deviceSpeed > bestDeviceSpeed)
207                 {
208                         bestDevice = i;
209                         bestDeviceSpeed = deviceSpeed;
210                 }
211         }
212         if (bestDevice == -1)
213         {
214                 HLTWarning("No CUDA Device available, aborting CUDA Initialisation");
215                 fgGPUUsed = 0;
216                 ReleaseGlobalLock(semLock);
217                 return(1);
218         }
219
220   int cudaDevice;
221   if (forceDeviceID == -1)
222           cudaDevice = bestDevice;
223   else
224           cudaDevice = forceDeviceID;
225 #else
226         int cudaDevice = 0;
227 #endif
228
229   cudaGetDeviceProperties(&fCudaDeviceProp, cudaDevice);
230
231   if (fDebugLevel >= 1)
232   {
233           HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);
234           HLTInfo("totalGlobalMem = %lld", (unsigned long long int) fCudaDeviceProp.totalGlobalMem);
235           HLTInfo("sharedMemPerBlock = %lld", (unsigned long long int) fCudaDeviceProp.sharedMemPerBlock);
236           HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);
237           HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);
238           HLTInfo("memPitch = %lld", (unsigned long long int) fCudaDeviceProp.memPitch);
239           HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);
240           HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);
241           HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);
242           HLTInfo("totalConstMem = %lld", (unsigned long long int) fCudaDeviceProp.totalConstMem);
243           HLTInfo("major = %d", fCudaDeviceProp.major);
244           HLTInfo("minor = %d", fCudaDeviceProp.minor);
245           HLTInfo("clockRate = %d", fCudaDeviceProp.clockRate);
246           HLTInfo("textureAlignment = %lld", (unsigned long long int) fCudaDeviceProp.textureAlignment);
247   }
248
249   if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
250   {
251         HLTError( "Unsupported CUDA Device" );
252         fgGPUUsed = 0;
253         ReleaseGlobalLock(semLock);
254         return(1);
255   }
256
257   if (CudaFailedMsg(cudaSetDevice(cudaDevice)))
258   {
259           HLTError("Could not set CUDA Device!");
260           fgGPUUsed = 0;
261           ReleaseGlobalLock(semLock);
262           return(1);
263   }
264
265   if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
266   {
267           HLTError("CUDA Memory Allocation Error");
268           cudaThreadExit();
269           fgGPUUsed = 0;
270           ReleaseGlobalLock(semLock);
271           return(1);
272   }
273   ReleaseGlobalLock(semLock);
274   if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
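  //Page-locked (pinned) host memory is required for the asynchronous cudaMemcpyAsync transfers used during reconstruction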
275   int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
276   if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
277   {
278           cudaFree(fGPUMemory);
279           cudaThreadExit();
280           HLTError("Error allocating Page Locked Host Memory");
281           fgGPUUsed = 0;
282           return(1);
283   }
284   if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);
285
286   if (fDebugLevel >= 1)
287   {
288           CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
289   }
290
291   fSliceCount = sliceCount;
292   //Don't run the constructor / destructor here, this will just be a local memcpy of the Tracker in GPU Memory
293   fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);
294
295   for (int i = 0;i < fgkNSlices;i++)
296   {
297     fSlaveTrackers[i].SetGPUTracker();
298         fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
299         fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
300   }
301
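  //CUDA streams: stream 0 is used for the constant-memory upload and common-memory transfers, streams 0/1 (iSlice & 1)
  //for alternating per-slice uploads and kernels, stream 2 for PreInitRowBlocks, and one stream per slice for the
  //tracklet selector and output transfer - hence at least max(3, sliceCount) streams are created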
302   fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
303   cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
304   for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
305   {
306         if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
307         {
308             cudaFree(fGPUMemory);
309             cudaFreeHost(fHostLockedMemory);
310             cudaThreadExit();
311             HLTError("Error creating CUDA Stream");
312             fgGPUUsed = 0;
313             return(1);
314         }
315   }
316
317   fCudaInitialized = 1;
318   HLTImportant("CUDA Initialisation successful (Device %d: %s, Thread %d)", cudaDevice, fCudaDeviceProp.name, fThreadId);
319
320 #if defined(HLTCA_STANDALONE) && !defined(CUDA_DEVICE_EMULATION)
321   if (fDebugLevel < 2)
322   {
323           //Do one initial run for benchmarking purposes
324           const int useDebugLevel = fDebugLevel;
325           fDebugLevel = 0;
326           AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];
327
328           std::ifstream fin;
329
330           AliHLTTPCCAParam tmpParam;
331           AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;
332
333           fin.open("events/settings.dump");
334           int tmpCount;
335           fin >> tmpCount;
336           for (int i = 0;i < sliceCount;i++)
337           {
338                 fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);
339                 tmpParam.ReadSettings(fin);
340                 InitializeSliceParam(i, tmpParam);
341           }
342           fin.close();
343
344           fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);
345           for (int i = 0;i < sliceCount;i++)
346           {
347                 tmpCluster[i].StartReading(i, 0);
348                 tmpCluster[i].ReadEvent(fin);
349                 tmpCluster[i].FinishReading();
350           }
351           fin.close();
352
353           AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];
354           memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));
355
356           Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);
357           for (int i = 0;i < sliceCount;i++)
358           {
359                   free(tmpOutput[i]);
360                   tmpOutput[i] = NULL;
361                   fSlaveTrackers[i].SetOutputControl(NULL);
362           }
363           delete[] tmpOutput;
364           delete[] tmpCluster;
365           fDebugLevel = useDebugLevel;
366   }
367 #endif
368   return(0);
369 }
370
371 template <class T> inline T* AliHLTTPCCAGPUTrackerNVCC::alignPointer(T* ptr, int alignment)
372 {
373         //Helper template (not a macro) to align a pointer to the given alignment.
374         //Callers align to 1 MB segments, which should be consistent with every alignment in the tracker
375         //(as long as every single data structure is <= 1 MB)
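        //Illustrative example: alignPointer((char*) 0x100001, 1024 * 1024) returns (char*) 0x200000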
376
377         size_t adr = (size_t) ptr;
378         if (adr % alignment)
379         {
380                 adr += alignment - (adr % alignment);
381         }
382         return((T*) adr);
383 }
384
385 bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsg(cudaError_t error)
386 {
387         //Check for CUDA Error and in the case of an error display the corresponding error string
388         if (error == cudaSuccess) return(false);
389         HLTWarning("CUDA Error: %d / %s", error, cudaGetErrorString(error));
390         return(true);
391 }
392
393 int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state)
394 {
395         //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards
396
397         if (fDebugLevel == 0) return(0);
398         cudaError cuErr;
399         cuErr = cudaGetLastError();
400         if (cuErr != cudaSuccess)
401         {
402                 HLTError("Cuda Error %s while invoking kernel (%s)", cudaGetErrorString(cuErr), state);
403                 return(1);
404         }
405         if (CudaFailedMsg(cudaThreadSynchronize()))
406         {
407                 HLTError("CUDA Error while synchronizing (%s)", state);
408                 return(1);
409         }
410         if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
411         return(0);
412 }
413
414 void AliHLTTPCCAGPUTrackerNVCC::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
415 {
416         //Set Debug Level and Debug output File if applicable
417         fDebugLevel = dwLevel;
418         if (NewOutFile) fOutFile = NewOutFile;
419 }
420
421 int AliHLTTPCCAGPUTrackerNVCC::SetGPUTrackerOption(char* OptionName, int /*OptionValue*/)
422 {
423         //Set a specific GPU Tracker Option
424         {
425                 HLTError("Unknown Option: %s", OptionName);
426                 return(1);
427         }
428         //No Options used at the moment
429         //return(0);
430 }
431
432 #ifdef HLTCA_STANDALONE
433 void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int iSlice, int i)
434 {
435   //Run Performance Query for timer i of slice iSlice
436   if (fDebugLevel >= 1)
437   {
438           AliHLTTPCCAStandaloneFramework::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));
439   }
440 }
441 #else
442 void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
443 #endif
444
445 void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
446 {
447         //Dump Rowblocks to File
448         if (fDebugLevel >= 4)
449         {
450                 *fOutFile << "RowBlock Tracklets" << std::endl;
451         
452                 int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);
453                 int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);
454                 uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT);
455                 CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
456                 CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
457                 CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT, cudaMemcpyDeviceToHost));
458                 CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
459
460                 int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
461                 for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
462                 {
463                         *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " << 
464                                 rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";
465                         for (int j = 0;j < rowBlockPos[i].x;j++)
466                         {
467                                 //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
468                                 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
469                                 if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)
470                                 {
471                                         HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);
472                                 }
473                                 k++;
474                                 if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)
475                                 {
476                                         HLTError("Error, -1 Tracklet found");
477                                 }
478                         }
479                         *fOutFile << std::endl << "Phase 2: ";
480                         for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
481                         {
482                                 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
483                         }
484                         *fOutFile << std::endl;
485                 }
486
487                 if (check)
488                 {
489                         *fOutFile << "Starting Threads: (First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;
490                         for (int i = 0;i < HLTCA_GPU_BLOCK_COUNT;i++)
491                         {
492                                 *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;
493                         }
494                 }
495
496                 free(rowBlockPos);
497                 free(rowBlockTracklets);
498                 free(blockStartingTracklet);
499         }
500 }
501
502 __global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)
503 {
504         //Initialize GPU RowBlocks and HitWeights
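        //Grid-stride loops with int4 vector stores: RowBlockPos is cleared to 0, RowBlockTracklets is set to -1 (empty),
        //and the slice data hit weights are cleared to 0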
505         int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;
506         int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;
507         const int stride = blockDim.x * gridDim.x;
508         int4 i0, i1;
509         i0.x = i0.y = i0.z = i0.w = 0;
510         i1.x = i1.y = i1.z = i1.w = -1;
511         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)
512                 RowBlockPos[i] = i0;
513         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)
514                 rowBlockTracklets4[i] = i1;
515         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)
516                 sliceDataHitWeights4[i] = i0;
517 }
518
519 int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
520 {
521         //Primary reconstruction function
522         cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
523
524         if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;
525         
526         if (!fCudaInitialized)
527         {
528             HLTError("GPUTracker not initialized");
529             return(1);
530         }
531         if (sliceCountLocal > fSliceCount)
532         {
533             HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);
534             return(1);
535         }
536         if (fThreadId != GetThread())
537         {
538             HLTError("GPUTracker context was initialized by a different thread, Initializing Thread: %d, Processing Thread: %d", fThreadId, GetThread());
539             return(1);
540         }
541
542         if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);
543
544         if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
545         {
546                 HLTError("Insufficient constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));
547                 return(1);
548         }
549
550         if (fDebugLevel >= 4)
551         {
552                 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
553                 {
554                         *fOutFile << std::endl << std::endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << std::endl;
555                 }
556         }
557
558         memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
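        //fGpuTracker is a host-side staging copy of the trackers; the loop below replaces its pointers with GPU addresses
        //before the objects are uploaded to constant memory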
559
560         if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
561
562 #ifdef HLTCA_GPU_TIME_PROFILE
563         unsigned __int64 a, b, c, d;
564         AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);
565         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);
566 #endif
567         
568         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
569         {
570                 //Make this a GPU Tracker
571                 fGpuTracker[iSlice].SetGPUTracker();
572                 fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
573                 fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
574                 fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
575
576                 //Set Pointers to GPU Memory
577                 char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
578
579                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
580                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
581                 tmpMem = alignPointer(tmpMem, 1024 * 1024);
582
583                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
584                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);
585                 tmpMem = alignPointer(tmpMem, 1024 * 1024);
586
587                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
588                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());
589                 tmpMem = alignPointer(tmpMem, 1024 * 1024);
590
591                 if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
592                 {
593                         HLTError("Insufficient Track Memory");
594                         return(1);
595                 }
596
597                 if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
598                 {
599                         HLTError("Insufficient Global Memory");
600                         return(1);
601                 }
602
603                 //Initialize Startup Constants
604                 *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
605                 *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
606                 *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
607                 fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = HLTCA_GPU_BLOCK_COUNT * (iSlice + 1) / sliceCountLocal - HLTCA_GPU_BLOCK_COUNT * (iSlice) / sliceCountLocal;
608                 if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);
609                 fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
610                 fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
611                 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
612                 fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
613         }
614
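        //Optionally bind the slice data memory as 1D textures so the device code can read hit data through the texture cache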
615 #ifdef HLTCA_GPU_TEXTURE_FETCH
616                 cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
617                 size_t offset;
618                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
619                 {
620                         HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
621                         return(1);
622                 }
623                 cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
624                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
625                 {
626                         HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
627                         return(1);
628                 }
629                 cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
630                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
631                 {
632                         HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
633                         return(1);
634                 }
635 #endif
636
637         //Copy Tracker Object to GPU Memory
638         if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
639 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
640         if (CudaFailedMsg(cudaMalloc(&fGpuTracker[0].fStageAtSync, 100000000))) return(1);
641         CudaFailedMsg(cudaMemset(fGpuTracker[0].fStageAtSync, 0, 100000000));
642 #endif
643         CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));
644
645         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
646         {
647                 StandalonePerfTime(firstSlice + iSlice, 0);
648
649                 //Initialize GPU Slave Tracker
650                 if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
651                 fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
652 #ifdef HLTCA_GPU_TIME_PROFILE
653                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
654 #endif
655                 fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
656
657                   if (fDebugLevel >= 4)
658                   {
659                           *fOutFile << std::endl << std::endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << std::endl;
660                           *fOutFile << "Slice Data:" << std::endl;
661                           fSlaveTrackers[firstSlice + iSlice].DumpSliceData(*fOutFile);
662                   }
663
664 #ifdef HLTCA_GPU_TIME_PROFILE
665                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
666                 printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
667 #endif
668                 if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
669                 {
670                         HLTError("Insufficient Slice Data Memory");
671                         return(1);
672                 }
673
674                 /*if (fSlaveTrackers[firstSlice + iSlice].CheckEmptySlice())
675                 {
676                         if (fDebugLevel >= 3) HLTInfo("Slice Empty, not running GPU Tracker");
677                         if (sliceCountLocal == 1)
678                                 return(0);
679                 }*/
680
681                 //Initialize temporary memory where needed
682                 if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");           
683                 PreInitRowBlocks<<<30, 256, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());
684
685                 //Copy Data to GPU Global Memory
686                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
687                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
688                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
689
690                 if (fDebugLevel >= 4)
691                 {
692                         if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
693                         fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS);
694                         fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() );
695                 }
696                 
697                 if (CUDASync("Initialization")) return(1);
698                 StandalonePerfTime(firstSlice + iSlice, 1);
699
700                 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");
701                 AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
702
703                 if (CUDASync("Neighbours finder")) return 1;
704
705                 StandalonePerfTime(firstSlice + iSlice, 2);
706
707                 if (fDebugLevel >= 4)
708                 {
709                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
710                         fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
711                 }
712
713                 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");
714                 AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
715                 if (CUDASync("Neighbours Cleaner")) return 1;
716
717                 StandalonePerfTime(firstSlice + iSlice, 3);
718
719                 if (fDebugLevel >= 4)
720                 {
721                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
722                         fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
723                 }
724
725                 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");
726                 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
727                 if (CUDASync("Start Hits Finder")) return 1;
728
729                 StandalonePerfTime(firstSlice + iSlice, 4);
730
731                 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter");
732                 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<30, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
733                 if (CUDASync("Start Hits Sorter")) return 1;
734
735                 StandalonePerfTime(firstSlice + iSlice, 5);
736
737                 if (fDebugLevel >= 2)
738                 {
739                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
740                         if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
741                         if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS)
742                         {
743                                 HLTError("HLTCA_GPU_MAX_TRACKLETS constant insufficient");
744                                 return(1);
745                         }
746                 }
747
748                 if (fDebugLevel >= 4)
749                 {
750                         *fOutFile << "Temporary ";
751                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
752                         fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
753                         uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
754                         CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
755                         *fOutFile << "Start Hits Sort Vector:" << std::endl;
756                         for (int i = 0;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows();i++)
757                         {
758                                 *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;
759                         }
760                         free(tmpMemory);
761
762                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
763                         fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
764                 }
765
766                 StandalonePerfTime(firstSlice + iSlice, 6);
767                 
768                 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
769         }
770
771         StandalonePerfTime(firstSlice, 7);
772 #ifdef HLTCA_GPU_PREFETCHDATA
773         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
774         {
775                 if (fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v) > ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4))
776                 {
777                         HLTError("Insufficient GPU shared Memory, required: %d, available %d", fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v), ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4));
778                         return(1);
779                 }
780                 if (fDebugLevel >= 1)
781                 {
782                         static int infoShown = 0;
783                         if (!infoShown)
784                         {
785                                 HLTInfo("GPU Shared Memory Cache Size: %d", 2 * fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v));
786                                 infoShown = 1;
787                         }
788                 }
789         }
790 #endif
791
792         int nHardCollisions = 0;
793
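//On a hard scheduling collision the tracklet counters and row blocks are reset and execution jumps back
//to this label to rerun the tracklet constructor (at most 10 retries, see nHardCollisions below)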
794 RestartTrackletConstructor:
795         if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
796         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
797         {
798                 AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);
799                 if (CUDASync("Tracklet Initializer")) return 1;
800                 DumpRowBlocks(fSlaveTrackers, iSlice);
801         }
802
803         if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
804         AliHLTTPCCATrackletConstructorNewGPU<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>();
805         if (CUDASync("Tracklet Constructor (new)")) return 1;
806         
807         StandalonePerfTime(firstSlice, 8);
808
809         if (fDebugLevel >= 4)
810         {
811                 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
812                 {
813                         DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);
814                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
815                         if (fDebugLevel >= 5)
816                         {
817                                 HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
818                         }
819                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
820                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
821                         fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);
822                 }
823         }
824
825         int runSlices = 0;
826         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)
827         {
828                 if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;
829                 if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);
830                 AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));
831         }
832         if (CUDASync("Tracklet Selector")) return 1;
833         StandalonePerfTime(firstSlice, 9);
834
835         CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + 0].CommonMemory(), fGpuTracker[0].CommonMemory(), fGpuTracker[0].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[0]));
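        //Pipelined output: for slice i the track data is fetched asynchronously from the GPU while the host
        //writes the output of slice i - 1; the extra loop iteration drains the last slice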
836         for (int iSliceTmp = 0;iSliceTmp <= sliceCountLocal;iSliceTmp++)
837         {
838                 if (iSliceTmp < sliceCountLocal)
839                 {
840                         int iSlice = iSliceTmp;
841                         if (fDebugLevel >= 3) HLTInfo("Transferring Tracks from GPU to Host");
842                         cudaStreamSynchronize(cudaStreams[iSlice]);
843                         CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
844                         CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
845                         if (iSlice + 1 < sliceCountLocal)
846                                 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[iSlice + 1]));
847                 }
848
849                 if (iSliceTmp)
850                 {
851                         int iSlice = iSliceTmp - 1;
852                         cudaStreamSynchronize(cudaStreams[iSlice]);
853
854                         if (fDebugLevel >= 4)
855                         {
856                                 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
857                                 fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
858                                 fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);
859                         }
860
861                         if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
862                         {
863                                 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION && nHardCollisions++ < 10)
864                                 {
865                                         HLTWarning("Hard scheduling collision occurred, rerunning Tracklet Constructor");
866                                         for (int i = 0;i < sliceCountLocal;i++)
867                                         {
868                                                 cudaThreadSynchronize();
869                                                 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));
870                                                 *fSlaveTrackers[firstSlice + i].NTracks() = 0;
871                                                 *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;
872                                                 fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;
873                                                 CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));
874                                                 PreInitRowBlocks<<<30, 256>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());
875                                         }
876                                         goto RestartTrackletConstructor;
877                                 }
878                                 HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
879                                 return(1);
880                         }
881                         if (fDebugLevel >= 3) HLTInfo("Tracks Transferred: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
882
883                         fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
884 #ifdef HLTCA_GPU_TIME_PROFILE
885                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
886 #endif
887                         fSlaveTrackers[firstSlice + iSlice].WriteOutput();
888 #ifdef HLTCA_GPU_TIME_PROFILE
889                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
890                         printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
891 #endif
892
893                         if (fDebugLevel >= 4)
894                         {
895                                 delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();
896                                 delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();
897                         }
898                 }
899         }
900
901         StandalonePerfTime(firstSlice, 10);
902
903         if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
904
905 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
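        //Dump the synchronization stage recorded per GPU thread during the tracklet constructor to profile.txt
        //and visualize it as a bitmap: one pixel per thread, colored by stage, with a white separator column every 32 threads (one warp)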
906         char* stageAtSync = (char*) malloc(100000000);
907         CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].fStageAtSync, 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
908         cudaFree(fGpuTracker[0].fStageAtSync);
909
910         FILE* fp = fopen("profile.txt", "w+");
911         FILE* fp2 = fopen("profile.bmp", "w+b");
912         int nEmptySync = 0, fEmpty;
913
914         const int bmpheight = 1000;
915         BITMAPFILEHEADER bmpFH;
916         BITMAPINFOHEADER bmpIH;
917         ZeroMemory(&bmpFH, sizeof(bmpFH));
918         ZeroMemory(&bmpIH, sizeof(bmpIH));
919         
920         bmpFH.bfType = 19778; //"BM"
921         bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1) * bmpheight ;
922         bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);
923
924         bmpIH.biSize = sizeof(bmpIH);
925         bmpIH.biWidth = HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1;
926         bmpIH.biHeight = bmpheight;
927         bmpIH.biPlanes = 1;
928         bmpIH.biBitCount = 32;
929
930         fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
931         fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);  
932
933         for (int i = 0;i < bmpheight * HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;i += HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT)
934         {
935                 fEmpty = 1;
936                 for (int j = 0;j < HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;j++)
937                 {
938                         fprintf(fp, "%d\t", stageAtSync[i + j]);
939                         int color = 0;
940                         if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);
941                         if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);
942                         if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);
943                         if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);
944                         fwrite(&color, 1, sizeof(int), fp2);
945                         if (j > 0 && j % 32 == 0)
946                         {
947                                 color = RGB(255, 255, 255);
948                                 fwrite(&color, 1, 4, fp2);
949                         }
950                         if (stageAtSync[i + j]) fEmpty = 0;
951                 }
952                 fprintf(fp, "\n");
953                 if (fEmpty) nEmptySync++;
954                 else nEmptySync = 0;
955                 //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;
956         }
957
958         fclose(fp);
959         fclose(fp2);
960         free(stageAtSync);
961 #endif 
962
963         return(0);
964 }
965
966 int AliHLTTPCCAGPUTrackerNVCC::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
967 {
968         //Initialize Slice Tracker Parameter for a slave tracker
969         fSlaveTrackers[iSlice].Initialize(param);
970         if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
971         {
972                 HLTError("Error, Slice Tracker %d Row Count of %d does not match the constant HLTCA_ROW_COUNT of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
973                 return(1);
974         }
975         return(0);
976 }
977
978 int AliHLTTPCCAGPUTrackerNVCC::ExitGPU()
979 {
980         //Uninitialize CUDA
981         cudaThreadSynchronize();
982         if (fGPUMemory)
983         {
984                 cudaFree(fGPUMemory);
985                 fGPUMemory = NULL;
986         }
987         if (fHostLockedMemory)
988         {
989                 for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
990                 {
991                         cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
992                 }
993                 free(fpCudaStreams);
994                 fGpuTracker = NULL;
995                 cudaFreeHost(fHostLockedMemory);
996         }
997
998         if (CudaFailedMsg(cudaThreadExit()))
999         {
1000                 HLTError("Could not uninitialize GPU");
1001                 return(1);
1002         }
1003         HLTInfo("CUDA Uninitialized");
1004         fgGPUUsed = false;
1005         fCudaInitialized = 0;
1006         return(0);
1007 }
1008
1009 void AliHLTTPCCAGPUTrackerNVCC::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
1010 {
1011         //Set Output Control Pointers
1012         fOutputControl = val;
1013         for (int i = 0;i < fgkNSlices;i++)
1014         {
1015                 fSlaveTrackers[i].SetOutputControl(val);
1016         }
1017 }
1018
1019 int AliHLTTPCCAGPUTrackerNVCC::GetThread()
1020 {
1021         //Get Thread ID
1022 #ifdef R__WIN32
1023         return((int) (size_t) GetCurrentThread());
1024 #else
1025         return((int) syscall (SYS_gettid));
1026 #endif
1027 }
1028
1029 unsigned long long int* AliHLTTPCCAGPUTrackerNVCC::PerfTimer(int iSlice, unsigned int i)
1030 {
1031         //Returns pointer to PerfTimer i of slice iSlice
1032         return(fSlaveTrackers ? fSlaveTrackers[iSlice].PerfTimer(i) : NULL);
1033 }
1034
1035 const AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerNVCC::OutputControl() const
1036 {
1037         //Return Pointer to Output Control Structure
1038         return fOutputControl;
1039 }
1040
1041 int AliHLTTPCCAGPUTrackerNVCC::GetSliceCount() const
1042 {
1043         //Return max slice count processable
1044         return(fSliceCount);
1045 }
1046
1047 AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()
1048 {
1049         return new AliHLTTPCCAGPUTrackerNVCC;
1050 }
1051 void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr)
1052 {
1053         delete ptr;
1054 }
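//Illustrative usage sketch (assuming the caller accesses these methods through the AliHLTTPCCAGPUTracker interface
//and has cluster data and slice output pointers prepared):
//  AliHLTTPCCAGPUTracker* tracker = AliHLTTPCCAGPUTrackerNVCCCreate();
//  if (tracker->InitGPU(sliceCount, -1) == 0)
//  {
//    tracker->Reconstruct(sliceOutput, clusterData, firstSlice, sliceCount);
//    tracker->ExitGPU();
//  }
//  AliHLTTPCCAGPUTrackerNVCCDestroy(tracker);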
1055