HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cu

   1 // **************************************************************************
   2 // This file is property of and copyright by the ALICE HLT Project          *
   3 // ALICE Experiment at CERN, All rights reserved.                           *
   4 //                                                                          *
   5 // Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
   6 //                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *
   7 //                                      David Rohr <drohr@kip.uni-heidelberg.de>                                *
   8 //                  for The ALICE HLT Project.                              *
   9 //                                                                          *
  10 // Permission to use, copy, modify and distribute this software and its     *
  11 // documentation strictly for non-commercial purposes is hereby granted     *
  12 // without fee, provided that the above copyright notice appears in all     *
  13 // copies and that both the copyright notice and this permission notice     *
  14 // appear in the supporting documentation. The authors make no claims       *
  15 // about the suitability of this software for any purpose. It is            *
  16 // provided "as is" without express or implied warranty.                    *
  17 //                                                                          *
  18 //***************************************************************************
  19
  20 #include "AliHLTTPCCAGPUTracker.h"
  21
  22 #ifdef BUILD_GPU
  23
  24 #include <cuda.h>
  25 #ifdef R__WIN32
  26 #else
  27 #include <sys/syscall.h>
  28 #include <semaphore.h>
  29 #include <fcntl.h>
  30 #endif
  31
  32 #include "AliHLTTPCCADef.h"
  33 #include "AliHLTTPCCAGPUConfig.h"
  34
  35 #include <sm_11_atomic_functions.h>
  36 #include <sm_12_atomic_functions.h>
  37
  38 #include <iostream>
  39
  40 //Disable assertions since they produce errors in GPU Code
  41 #ifdef assert
  42 #undef assert
  43 #endif
  44 #define assert(param)
  45
  46 __constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];
  47 #ifdef HLTCA_GPU_TEXTURE_FETCH
  48 texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;
  49 texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;
  50 texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
  51 #endif
  52
  53 //Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code
  54 #include "AliHLTTPCCATrackParam.cxx"
  55 #include "AliHLTTPCCATrack.cxx"
  56
  57 #include "AliHLTTPCCAHitArea.cxx"
  58 #include "AliHLTTPCCAGrid.cxx"
  59 #include "AliHLTTPCCARow.cxx"
  60 #include "AliHLTTPCCAParam.cxx"
  61 #include "AliHLTTPCCATracker.cxx"
  62
  63 #include "AliHLTTPCCAProcess.h"
  64
  65 #include "AliHLTTPCCATrackletSelector.cxx"
  66 #include "AliHLTTPCCANeighboursFinder.cxx"
  67 #include "AliHLTTPCCANeighboursCleaner.cxx"
  68 #include "AliHLTTPCCAStartHitsFinder.cxx"
  69 #include "AliHLTTPCCAStartHitsSorter.cxx"
  70 #include "AliHLTTPCCATrackletConstructor.cxx"
  71 #include "AliHLTTPCCASliceOutput.cxx"
  72
  73 #include "MemoryAssignmentHelpers.h"
  74
  75 #ifndef HLTCA_STANDALONE
  76 #include "AliHLTDefinitions.h"
  77 #include "AliHLTSystem.h"
  78 #endif
  79
  80 ClassImp( AliHLTTPCCAGPUTracker )
  81
  82 bool AliHLTTPCCAGPUTracker::fgGPUUsed = false;
  83
  84 #define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem"
  85
  86 void AliHLTTPCCAGPUTracker::ReleaseGlobalLock(void* sem)
  87 {
  88         //Release the global named semaphore that locks GPU Initialization
  89 #ifdef R__WIN32
  90         HANDLE* h = (HANDLE*) sem;
  91         ReleaseSemaphore(*h, 1, NULL);
  92         CloseHandle(*h);
  93         delete h;
  94 #else
  95         sem_t* pSem = (sem_t*) sem;
  96         sem_post(pSem);
  97         sem_unlink(SemLockName);
  98 #endif
  99 }
 100
 101 int AliHLTTPCCAGPUTracker::CheckMemorySizes(int sliceCount)
 102 {
 103         //Check constants for correct memory sizes
 104   if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
 105   {
 106           HLTError("Insufficiant Tracker Object Memory");
 107           return(1);
 108   }
 109
 110   if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
 111   {
 112           HLTError("Insufficiant Common Memory");
 113           return(1);
 114   }
 115
 116   if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
 117   {
 118           HLTError("Insufficiant Row Memory");
 119           return(1);
 120   }
 121   return(0);
 122 }
 123
 124 int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
 125 {
 126         //Find best CUDA device, initialize and allocate memory
 127
 128         if (CheckMemorySizes(sliceCount)) return(1);
 129
 130 #ifdef R__WIN32
 131         HANDLE* semLock = new HANDLE;
 132         *semLock = CreateSemaphore(NULL, 1, 1, SemLockName);
 133         if (*semLock == NULL)
 134         {
 135                 HLTError("Error creating GPUInit Semaphore");
 136                 return(1);
 137         }
 138         WaitForSingleObject(*semLock, INFINITE);
 139 #else
 140         sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1);
 141         if (semLock == SEM_FAILED)
 142         {
 143                 HLTError("Error creating GPUInit Semaphore");
 144                 return(1);
 145         }
 146         sem_wait(semLock);
 147 #endif
 148
 149         if (fgGPUUsed)
 150         {
 151             HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in same process");
 152                 ReleaseGlobalLock(semLock);
 153             return(1);
 154         }
 155         fgGPUUsed = 1;
 156         fThreadId = GetThread();
 157
 158         cudaDeviceProp fCudaDeviceProp;
 159
 160         fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
 161
 162 #ifndef CUDA_DEVICE_EMULATION
 163         int count, bestDevice = -1;
 164         long long int bestDeviceSpeed = 0, deviceSpeed;
 165         if (CudaFailedMsg(cudaGetDeviceCount(&count)))
 166         {
 167                 HLTError("Error getting CUDA Device Count");
 168                 fgGPUUsed = 0;
 169                 ReleaseGlobalLock(semLock);
 170                 return(1);
 171         }
 172         if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");
 173         for (int i = 0;i < count;i++)
 174         {
 175                 unsigned int free, total;
 176                 cuInit(0);
 177                 CUdevice tmpDevice;
 178                 cuDeviceGet(&tmpDevice, i);
 179                 CUcontext tmpContext;
 180                 cuCtxCreate(&tmpContext, 0, tmpDevice);
 181                 if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";
 182                 cuCtxDestroy(tmpContext);
 183                 CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i));
 184
 185                 int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && free >= fGPUMemSize;
 186
 187                 if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %d / %d)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, free, fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");
 188                 deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free;
 189                 if (deviceOK && deviceSpeed > bestDeviceSpeed)
 190                 {
 191                         bestDevice = i;
 192                         bestDeviceSpeed = deviceSpeed;
 193                 }
 194         }
 195         if (bestDevice == -1)
 196         {
 197                 HLTWarning("No CUDA Device available, aborting CUDA Initialisation");
 198                 fgGPUUsed = 0;
 199                 ReleaseGlobalLock(semLock);
 200                 return(1);
 201         }
 202
 203   int cudaDevice;
 204   if (forceDeviceID == -1)
 205           cudaDevice = bestDevice;
 206   else
 207           cudaDevice = forceDeviceID;
 208 #else
 209         int cudaDevice = 0;
 210 #endif
 211
 212   cudaGetDeviceProperties(&fCudaDeviceProp ,cudaDevice );
 213
 214   if (fDebugLevel >= 1)
 215   {
 216           HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);
 217           HLTInfo("totalGlobalMem = %d", fCudaDeviceProp.totalGlobalMem);
 218           HLTInfo("sharedMemPerBlock = %d", fCudaDeviceProp.sharedMemPerBlock);
 219           HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);
 220           HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);
 221           HLTInfo("memPitch = %d", fCudaDeviceProp.memPitch);
 222           HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);
 223           HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);
 224           HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);
 225           HLTInfo("totalConstMem = %d", fCudaDeviceProp.totalConstMem);
 226           HLTInfo("major = %d", fCudaDeviceProp.major);
 227           HLTInfo("minor = %d", fCudaDeviceProp.minor);
 228           HLTInfo("clockRate %d= ", fCudaDeviceProp.clockRate);
 229           HLTInfo("textureAlignment %d= ", fCudaDeviceProp.textureAlignment);
 230   }
 231
 232   if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
 233   {
 234         HLTError( "Unsupported CUDA Device" );
 235         fgGPUUsed = 0;
 236         ReleaseGlobalLock(semLock);
 237         return(1);
 238   }
 239
 240   if (CudaFailedMsg(cudaSetDevice(cudaDevice)))
 241   {
 242           HLTError("Could not set CUDA Device!");
 243           fgGPUUsed = 0;
 244           ReleaseGlobalLock(semLock);
 245           return(1);
 246   }
 247
 248   if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
 249   {
 250           HLTError("CUDA Memory Allocation Error");
 251           cudaThreadExit();
 252           fgGPUUsed = 0;
 253           ReleaseGlobalLock(semLock);
 254           return(1);
 255   }
 256   ReleaseGlobalLock(semLock);
 257   if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
 258   int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
 259   if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
 260   {
 261           cudaFree(fGPUMemory);
 262           cudaThreadExit();
 263           HLTError("Error allocating Page Locked Host Memory");
 264           fgGPUUsed = 0;
 265           return(1);
 266   }
 267   if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);
 268
 269   if (fDebugLevel >= 1)
 270   {
 271           CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
 272   }
 273
 274   fSliceCount = sliceCount;
 275   //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
 276   fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);
 277
 278   for (int i = 0;i < fgkNSlices;i++)
 279   {
 280     fSlaveTrackers[i].SetGPUTracker();
 281         fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
 282         fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
 283   }
 284
 285   fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
 286   cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
 287   for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
 288   {
 289         if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
 290         {
 291             cudaFree(fGPUMemory);
 292             cudaFreeHost(fHostLockedMemory);
 293             cudaThreadExit();
 294             HLTError("Error creating CUDA Stream");
 295             fgGPUUsed = 0;
 296             return(1);
 297         }
 298   }
 299
 300   fCudaInitialized = 1;
 301   HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %dd)", cudaDevice, fCudaDeviceProp.name, fThreadId);
 302
 303 #if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)
 304   if (fDebugLevel < 2)
 305   {
 306           //Do one initial run for Benchmark reasons
 307           const int useDebugLevel = fDebugLevel;
 308           fDebugLevel = 0;
 309           AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];
 310
 311           std::ifstream fin;
 312
 313           AliHLTTPCCAParam tmpParam;
 314           AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;
 315
 316           fin.open("events/settings.dump");
 317           int tmpCount;
 318           fin >> tmpCount;
 319           for (int i = 0;i < sliceCount;i++)
 320           {
 321                 fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);
 322                 tmpParam.ReadSettings(fin);
 323                 InitializeSliceParam(i, tmpParam);
 324           }
 325           fin.close();
 326
 327           fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);
 328           for (int i = 0;i < sliceCount;i++)
 329           {
 330                 tmpCluster[i].StartReading(i, 0);
 331                 tmpCluster[i].ReadEvent(fin);
 332                 tmpCluster[i].FinishReading();
 333           }
 334           fin.close();
 335
 336           AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];
 337           memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));
 338
 339           Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);
 340           for (int i = 0;i < sliceCount;i++)
 341           {
 342                   free(tmpOutput[i]);
 343                   tmpOutput[i] = NULL;
 344                   fSlaveTrackers[i].SetOutputControl(NULL);
 345           }
 346           delete[] tmpOutput;
 347           delete[] tmpCluster;
 348           fDebugLevel = useDebugLevel;
 349   }
 350 #endif
 351   return(0);
 352 }
 353
 354 template <class T> inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int alignment)
 355 {
 356         //Macro to align Pointers.
 357         //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker
 358         //(As long as every single data structure is <= 1 MB)
 359
 360         size_t adr = (size_t) ptr;
 361         if (adr % alignment)
 362         {
 363                 adr += alignment - (adr % alignment);
 364         }
 365         return((T*) adr);
 366 }
 367
 368 bool AliHLTTPCCAGPUTracker::CudaFailedMsg(cudaError_t error)
 369 {
 370         //Check for CUDA Error and in the case of an error display the corresponding error string
 371         if (error == cudaSuccess) return(false);
 372         HLTWarning("CUDA Error: %d / %s", error, cudaGetErrorString(error));
 373         return(true);
 374 }
 375
 376 int AliHLTTPCCAGPUTracker::CUDASync(char* state)
 377 {
 378         //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards
 379
 380         if (fDebugLevel == 0) return(0);
 381         cudaError cuErr;
 382         cuErr = cudaGetLastError();
 383         if (cuErr != cudaSuccess)
 384         {
 385                 HLTError("Cuda Error %s while invoking kernel (%s)", cudaGetErrorString(cuErr), state);
 386                 return(1);
 387         }
 388         if (CudaFailedMsg(cudaThreadSynchronize()))
 389         {
 390                 HLTError("CUDA Error while synchronizing (%s)", state);
 391                 return(1);
 392         }
 393         if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
 394         return(0);
 395 }
 396
 397 void AliHLTTPCCAGPUTracker::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
 398 {
 399         //Set Debug Level and Debug output File if applicable
 400         fDebugLevel = dwLevel;
 401         if (NewOutFile) fOutFile = NewOutFile;
 402 }
 403
 404 int AliHLTTPCCAGPUTracker::SetGPUTrackerOption(char* OptionName, int /*OptionValue*/)
 405 {
 406         //Set a specific GPU Tracker Option
 407         {
 408                 HLTError("Unknown Option: %s", OptionName);
 409                 return(1);
 410         }
 411         //No Options used at the moment
 412         //return(0);
 413 }
 414
 415 #ifdef HLTCA_STANDALONE
 416 void AliHLTTPCCAGPUTracker::StandalonePerfTime(int iSlice, int i)
 417 {
 418   //Run Performance Query for timer i of slice iSlice
 419   if (fDebugLevel >= 1)
 420   {
 421           AliHLTTPCCAStandaloneFramework::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));
 422   }
 423 }
 424 #else
 425 void AliHLTTPCCAGPUTracker::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
 426 #endif
 427
 428 void AliHLTTPCCAGPUTracker::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
 429 {
 430         //Dump Rowblocks to File
 431         if (fDebugLevel >= 4)
 432         {
 433                 *fOutFile << "RowBlock Tracklets" << std::endl;
 434
 435                 int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);
 436                 int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);
 437                 uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT);
 438                 CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
 439                 CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
 440                 CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT, cudaMemcpyDeviceToHost));
 441                 CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
 442
 443                 int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
 444                 for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
 445                 {
 446                         *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " <<
 447                                 rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";
 448                         for (int j = 0;j < rowBlockPos[i].x;j++)
 449                         {
 450                                 //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
 451                                 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
 452                                 if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)
 453                                 {
 454                                         HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);
 455                                 }
 456                                 k++;
 457                                 if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)
 458                                 {
 459                                         HLTError("Error, -1 Tracklet found");
 460                                 }
 461                         }
 462                         *fOutFile << std::endl << "Phase 2: ";
 463                         for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
 464                         {
 465                                 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
 466                         }
 467                         *fOutFile << std::endl;
 468                 }
 469
 470                 if (check)
 471                 {
 472                         *fOutFile << "Starting Threads: (First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;
 473                         for (int i = 0;i < HLTCA_GPU_BLOCK_COUNT;i++)
 474                         {
 475                                 *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;
 476                         }
 477                 }
 478
 479                 free(rowBlockPos);
 480                 free(rowBlockTracklets);
 481                 free(blockStartingTracklet);
 482         }
 483 }
 484
 485 __global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)
 486 {
 487         //Initialize GPU RowBlocks and HitWeights
 488         int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;
 489         int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;
 490         const int stride = blockDim.x * gridDim.x;
 491         int4 i0, i1;
 492         i0.x = i0.y = i0.z = i0.w = 0;
 493         i1.x = i1.y = i1.z = i1.w = -1;
 494         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)
 495                 RowBlockPos[i] = i0;
 496         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)
 497                 rowBlockTracklets4[i] = i1;
 498         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)
 499                 sliceDataHitWeights4[i] = i0;
 500 }
 501
 502 int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
 503 {
 504         //Primary reconstruction function
 505         cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
 506
 507         if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;
 508
 509         if (!fCudaInitialized)
 510         {
 511             HLTError("GPUTracker not initialized");
 512             return(1);
 513         }
 514         if (sliceCountLocal > fSliceCount)
 515         {
 516             HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);
 517             return(1);
 518         }
 519         if (fThreadId != GetThread())
 520         {
 521             HLTError("GPUTracker context was initialized by different thread, Initializing Thread: %d, Processing Thread: %d", fThreadId, GetThread());
 522             return(1);
 523         }
 524
 525         if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);
 526
 527         if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
 528         {
 529                 HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));
 530                 return(1);
 531         }
 532
 533         if (fDebugLevel >= 4)
 534         {
 535                 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
 536                 {
 537                         *fOutFile << std::endl << std::endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << std::endl;
 538                 }
 539         }
 540
 541         memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
 542
 543         if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
 544
 545 #ifdef HLTCA_GPU_TIME_PROFILE
 546         unsigned __int64 a, b, c, d;
 547         AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);
 548         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);
 549 #endif
 550
 551         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
 552         {
 553                 //Make this a GPU Tracker
 554                 fGpuTracker[iSlice].SetGPUTracker();
 555                 fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
 556                 fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
 557                 fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
 558
 559                 //Set Pointers to GPU Memory
 560                 char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
 561
 562                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
 563                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
 564                 tmpMem = alignPointer(tmpMem, 1024 * 1024);
 565
 566                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
 567                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);
 568                 tmpMem = alignPointer(tmpMem, 1024 * 1024);
 569
 570                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
 571                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());
 572                 tmpMem = alignPointer(tmpMem, 1024 * 1024);
 573
 574                 if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
 575                 {
 576                         HLTError("Insufficiant Track Memory");
 577                         return(1);
 578                 }
 579
 580                 if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
 581                 {
 582                         HLTError("Insufficiant Global Memory");
 583                         return(1);
 584                 }
 585
 586                 //Initialize Startup Constants
 587                 *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
 588                 *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
 589                 *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
 590                 fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = HLTCA_GPU_BLOCK_COUNT * (iSlice + 1) / sliceCountLocal - HLTCA_GPU_BLOCK_COUNT * (iSlice) / sliceCountLocal;
 591                 if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);
 592                 fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
 593                 fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
 594                 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
 595                 fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
 596         }
 597
 598 #ifdef HLTCA_GPU_TEXTURE_FETCH
 599                 cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
 600                 size_t offset;
 601                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
 602                 {
 603                         HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
 604                         return(1);
 605                 }
 606                 cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
 607                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
 608                 {
 609                         HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
 610                         return(1);
 611                 }
 612                 cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
 613                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
 614                 {
 615                         HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
 616                         return(1);
 617                 }
 618 #endif
 619
 620         //Copy Tracker Object to GPU Memory
 621         if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
 622 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
 623         if (CudaFailedMsg(cudaMalloc(&fGpuTracker[0].fStageAtSync, 100000000))) return(1);
 624         CudaFailedMsg(cudaMemset(fGpuTracker[0].fStageAtSync, 0, 100000000));
 625 #endif
 626         CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));
 627
 628         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
 629         {
 630                 StandalonePerfTime(firstSlice + iSlice, 0);
 631
 632                 //Initialize GPU Slave Tracker
 633                 if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
 634                 fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
 635 #ifdef HLTCA_GPU_TIME_PROFILE
 636                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
 637 #endif
 638                 fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
 639 #ifdef HLTCA_GPU_TIME_PROFILE
 640                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
 641                 printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
 642 #endif
 643                 if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
 644                 {
 645                         HLTError("Insufficiant Slice Data Memory");
 646                         return(1);
 647                 }
 648
 649                 /*if (fSlaveTrackers[firstSlice + iSlice].CheckEmptySlice())
 650                 {
 651                         if (fDebugLevel >= 3) HLTInfo("Slice Empty, not running GPU Tracker");
 652                         if (sliceCountLocal == 1)
 653                                 return(0);
 654                 }*/
 655
 656                 //Initialize temporary memory where needed
 657                 if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");
 658                 PreInitRowBlocks<<<30, 256, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());
 659
 660                 //Copy Data to GPU Global Memory
 661                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
 662                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
 663                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
 664
 665                 if (fDebugLevel >= 4)
 666                 {
 667                         if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
 668                         fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS);
 669                         fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() );
 670                 }
 671
 672                 if (CUDASync("Initialization")) return(1);
 673                 StandalonePerfTime(firstSlice + iSlice, 1);
 674
 675                 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");
 676                 AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
 677
 678                 if (CUDASync("Neighbours finder")) return 1;
 679
 680                 StandalonePerfTime(firstSlice + iSlice, 2);
 681
 682                 if (fDebugLevel >= 4)
 683                 {
 684                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
 685                         fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
 686                 }
 687
 688                 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");
 689                 AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
 690                 if (CUDASync("Neighbours Cleaner")) return 1;
 691
 692                 StandalonePerfTime(firstSlice + iSlice, 3);
 693
 694                 if (fDebugLevel >= 4)
 695                 {
 696                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
 697                         fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
 698                 }
 699
 700                 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");
 701                 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
 702                 if (CUDASync("Start Hits Finder")) return 1;
 703
 704                 StandalonePerfTime(firstSlice + iSlice, 4);
 705
 706                 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter");
 707                 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<30, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
 708                 if (CUDASync("Start Hits Sorter")) return 1;
 709
 710                 StandalonePerfTime(firstSlice + iSlice, 5);
 711
 712                 if (fDebugLevel >= 2)
 713                 {
 714                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
 715                         if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
 716                         if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS)
 717                         {
 718                                 HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant");
 719                                 return(1);
 720                         }
 721                 }
 722
 723                 if (fDebugLevel >= 4)
 724                 {
 725                         *fOutFile << "Temporary ";
 726                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
 727                         fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
 728                         uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
 729                         CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
 730                         *fOutFile << "Start Hits Sort Vector:" << std::endl;
 731                         for (int i = 0;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows();i++)
 732                         {
 733                                 *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;
 734                         }
 735                         free(tmpMemory);
 736
 737                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
 738                         fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
 739                 }
 740
 741                 StandalonePerfTime(firstSlice + iSlice, 6);
 742
 743                 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
 744         }
 745
 746         StandalonePerfTime(firstSlice, 7);
 747 #ifdef HLTCA_GPU_PREFETCHDATA
 748         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
 749         {
 750                 if (fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v) > ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4))
 751                 {
 752                         HLTError("Insufficiant GPU shared Memory, required: %d, available %d", fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v), ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4));
 753                         return(1);
 754                 }
 755                 if (fDebugLevel >= 1)
 756                 {
 757                         static int infoShown = 0;
 758                         if (!infoShown)
 759                         {
 760                                 HLTInfo("GPU Shared Memory Cache Size: %d", 2 * fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v));
 761                                 infoShown = 1;
 762                         }
 763                 }
 764         }
 765 #endif
 766
 767         int nHardCollisions = 0;
 768
 769 RestartTrackletConstructor:
 770         if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
 771         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
 772         {
 773                 AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);
 774                 if (CUDASync("Tracklet Initializer")) return 1;
 775                 DumpRowBlocks(fSlaveTrackers, iSlice);
 776         }
 777
 778         if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
 779         AliHLTTPCCATrackletConstructorNewGPU<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>();
 780         if (CUDASync("Tracklet Constructor (new)")) return 1;
 781
 782         StandalonePerfTime(firstSlice, 8);
 783
 784         if (fDebugLevel >= 4)
 785         {
 786                 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
 787                 {
 788                         DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);
 789                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
 790                         if (fDebugLevel >= 5)
 791                         {
 792                                 HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
 793                         }
 794                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
 795                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
 796                         fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);
 797                 }
 798         }
 799
 800         int runSlices = 0;
 801         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)
 802         {
 803                 if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;
 804                 if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);
 805                 AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));
 806         }
 807         if (CUDASync("Tracklet Selector")) return 1;
 808         StandalonePerfTime(firstSlice, 9);
 809
 810         CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + 0].CommonMemory(), fGpuTracker[0].CommonMemory(), fGpuTracker[0].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[0]));
 811         for (int iSliceTmp = 0;iSliceTmp <= sliceCountLocal;iSliceTmp++)
 812         {
 813                 if (iSliceTmp < sliceCountLocal)
 814                 {
 815                         int iSlice = iSliceTmp;
 816                         if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");
 817                         cudaStreamSynchronize(cudaStreams[iSlice]);
 818                         CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
 819                         CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
 820                         if (iSlice + 1 < sliceCountLocal)
 821                                 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[iSlice + 1]));
 822                 }
 823
 824                 if (iSliceTmp)
 825                 {
 826                         int iSlice = iSliceTmp - 1;
 827                         cudaStreamSynchronize(cudaStreams[iSlice]);
 828
 829                         if (fDebugLevel >= 4)
 830                         {
 831                                 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
 832                                 fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
 833                                 fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);
 834                         }
 835
 836                         if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
 837                         {
 838                                 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION && nHardCollisions++ < 10)
 839                                 {
 840                                         HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor");
 841                                         for (int i = 0;i < sliceCountLocal;i++)
 842                                         {
 843                                                 cudaThreadSynchronize();
 844                                                 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));
 845                                                 *fSlaveTrackers[firstSlice + i].NTracks() = 0;
 846                                                 *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;
 847                                                 fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;
 848                                                 CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));
 849                                                 PreInitRowBlocks<<<30, 256>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());
 850                                         }
 851                                         goto RestartTrackletConstructor;
 852                                 }
 853                                 HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
 854                                 return(1);
 855                         }
 856                         if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
 857
 858                         fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
 859 #ifdef HLTCA_GPU_TIME_PROFILE
 860                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
 861 #endif
 862                         fSlaveTrackers[firstSlice + iSlice].WriteOutput();
 863 #ifdef HLTCA_GPU_TIME_PROFILE
 864                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
 865                         printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
 866 #endif
 867
 868                         if (fDebugLevel >= 4)
 869                         {
 870                                 delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();
 871                                 delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();
 872                         }
 873                 }
 874         }
 875
 876         StandalonePerfTime(firstSlice, 10);
 877
 878         if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
 879
 880 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
 881         char* stageAtSync = (char*) malloc(100000000);
 882         CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].fStageAtSync, 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
 883         cudaFree(fGpuTracker[0].fStageAtSync);
 884
 885         FILE* fp = fopen("profile.txt", "w+");
 886         FILE* fp2 = fopen("profile.bmp", "w+b");
 887         int nEmptySync = 0, fEmpty;
 888
 889         const int bmpheight = 1000;
 890         BITMAPFILEHEADER bmpFH;
 891         BITMAPINFOHEADER bmpIH;
 892         ZeroMemory(&bmpFH, sizeof(bmpFH));
 893         ZeroMemory(&bmpIH, sizeof(bmpIH));
 894
 895         bmpFH.bfType = 19778; //"BM"
 896         bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1) * bmpheight ;
 897         bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);
 898
 899         bmpIH.biSize = sizeof(bmpIH);
 900         bmpIH.biWidth = HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1;
 901         bmpIH.biHeight = bmpheight;
 902         bmpIH.biPlanes = 1;
 903         bmpIH.biBitCount = 32;
 904
 905         fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
 906         fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);
 907
 908         for (int i = 0;i < bmpheight * HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;i += HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT)
 909         {
 910                 fEmpty = 1;
 911                 for (int j = 0;j < HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;j++)
 912                 {
 913                         fprintf(fp, "%d\t", stageAtSync[i + j]);
 914                         int color = 0;
 915                         if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);
 916                         if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);
 917                         if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);
 918                         if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);
 919                         fwrite(&color, 1, sizeof(int), fp2);
 920                         if (j > 0 && j % 32 == 0)
 921                         {
 922                                 color = RGB(255, 255, 255);
 923                                 fwrite(&color, 1, 4, fp2);
 924                         }
 925                         if (stageAtSync[i + j]) fEmpty = 0;
 926                 }
 927                 fprintf(fp, "\n");
 928                 if (fEmpty) nEmptySync++;
 929                 else nEmptySync = 0;
 930                 //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;
 931         }
 932
 933         fclose(fp);
 934         fclose(fp2);
 935         free(stageAtSync);
 936 #endif
 937
 938         return(0);
 939 }
 940
 941 int AliHLTTPCCAGPUTracker::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
 942 {
 943         //Initialize Slice Tracker Parameter for a slave tracker
 944         fSlaveTrackers[iSlice].Initialize(param);
 945         if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
 946         {
 947                 HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
 948                 return(1);
 949         }
 950         return(0);
 951 }
 952
 953 int AliHLTTPCCAGPUTracker::ExitGPU()
 954 {
 955         //Uninitialize CUDA
 956         cudaThreadSynchronize();
 957         if (fGPUMemory)
 958         {
 959                 cudaFree(fGPUMemory);
 960                 fGPUMemory = NULL;
 961         }
 962         if (fHostLockedMemory)
 963         {
 964                 for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
 965                 {
 966                         cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
 967                 }
 968                 free(fpCudaStreams);
 969                 fGpuTracker = NULL;
 970                 cudaFreeHost(fHostLockedMemory);
 971         }
 972
 973         if (CudaFailedMsg(cudaThreadExit()))
 974         {
 975                 HLTError("Could not uninitialize GPU");
 976                 return(1);
 977         }
 978         HLTInfo("CUDA Uninitialized");
 979         fgGPUUsed = false;
 980         fCudaInitialized = 0;
 981         return(0);
 982 }
 983
 984 void AliHLTTPCCAGPUTracker::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
 985 {
 986         //Set Output Control Pointers
 987         fOutputControl = val;
 988         for (int i = 0;i < fgkNSlices;i++)
 989         {
 990                 fSlaveTrackers[i].SetOutputControl(val);
 991         }
 992 }
 993
 994 int AliHLTTPCCAGPUTracker::GetThread()
 995 {
 996         //Get Thread ID
 997 #ifdef R__WIN32
 998         return((int) (size_t) GetCurrentThread());
 999 #else
1000         return((int) syscall (SYS_gettid));
1001 #endif
1002 }
1003
1004 #endif