HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cu

   1 // **************************************************************************
   2 // This file is property of and copyright by the ALICE HLT Project          *
   3 // ALICE Experiment at CERN, All rights reserved.                           *
   4 //                                                                          *
   5 // Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
   6 //                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *
   7 //                                      David Rohr <drohr@kip.uni-heidelberg.de>                                *
   8 //                  for The ALICE HLT Project.                              *
   9 //                                                                          *
  10 // Permission to use, copy, modify and distribute this software and its     *
  11 // documentation strictly for non-commercial purposes is hereby granted     *
  12 // without fee, provided that the above copyright notice appears in all     *
  13 // copies and that both the copyright notice and this permission notice     *
  14 // appear in the supporting documentation. The authors make no claims       *
  15 // about the suitability of this software for any purpose. It is            *
  16 // provided "as is" without express or implied warranty.                    *
  17 //                                                                          *
  18 //***************************************************************************
  19
  20 #include "AliHLTTPCCAGPUTracker.h"
  21
  22 #ifdef BUILD_GPU
  23
  24 #include <cuda.h>
  25 #ifdef R__WIN32
  26
  27 #else
  28 #include <sys/syscall.h>
  29 #endif
  30
  31 #include "AliHLTTPCCADef.h"
  32 #include "AliHLTTPCCAGPUConfig.h"
  33
  34 #include <sm_11_atomic_functions.h>
  35 #include <sm_12_atomic_functions.h>
  36
  37 #include <iostream>
  38
  39 //Disable assertions since they produce errors in GPU Code
  40 #ifdef assert
  41 #undef assert
  42 #endif
  43 #define assert(param)
  44
  45 __constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];
  46 #ifdef HLTCA_GPU_TEXTURE_FETCH
  47 texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;
  48 texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;
  49 texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
  50 #endif
  51
  52 //Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code
  53 #include "AliHLTTPCCATrackParam.cxx"
  54 #include "AliHLTTPCCATrack.cxx"
  55
  56 #include "AliHLTTPCCAHitArea.cxx"
  57 #include "AliHLTTPCCAGrid.cxx"
  58 #include "AliHLTTPCCARow.cxx"
  59 #include "AliHLTTPCCAParam.cxx"
  60 #include "AliHLTTPCCATracker.cxx"
  61
  62 #include "AliHLTTPCCAProcess.h"
  63
  64 #include "AliHLTTPCCATrackletSelector.cxx"
  65 #include "AliHLTTPCCANeighboursFinder.cxx"
  66 #include "AliHLTTPCCANeighboursCleaner.cxx"
  67 #include "AliHLTTPCCAStartHitsFinder.cxx"
  68 #include "AliHLTTPCCAStartHitsSorter.cxx"
  69 #include "AliHLTTPCCATrackletConstructor.cxx"
  70 #include "AliHLTTPCCASliceOutput.cxx"
  71
  72 #include "MemoryAssignmentHelpers.h"
  73
  74 #ifndef HLTCA_STANDALONE
  75 #include "AliHLTDefinitions.h"
  76 #include "AliHLTSystem.h"
  77 #endif
  78
  79 ClassImp( AliHLTTPCCAGPUTracker )
  80
  81 bool AliHLTTPCCAGPUTracker::fgGPUUsed = false;
  82
  83 int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
  84 {
  85         //Find best CUDA device, initialize and allocate memory
  86
  87         if (fgGPUUsed)
  88         {
  89             HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in same process");
  90             return(1);
  91         }
  92         fgGPUUsed = 1;
  93         fThreadId = GetThread();
  94
  95         cudaDeviceProp fCudaDeviceProp;
  96
  97         fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
  98
  99 #ifndef CUDA_DEVICE_EMULATION
 100         int count, bestDevice = -1;
 101         long long int bestDeviceSpeed = 0, deviceSpeed;
 102         if (CudaFailedMsg(cudaGetDeviceCount(&count)))
 103         {
 104                 HLTError("Error getting CUDA Device Count");
 105                 fgGPUUsed = 0;
 106                 return(1);
 107         }
 108         if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");
 109         for (int i = 0;i < count;i++)
 110         {
 111                 unsigned int free, total;
 112                 cuInit(0);
 113                 CUdevice tmpDevice;
 114                 cuDeviceGet(&tmpDevice, i);
 115                 CUcontext tmpContext;
 116                 cuCtxCreate(&tmpContext, 0, tmpDevice);
 117                 if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";
 118                 cuCtxDestroy(tmpContext);
 119                 CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i));
 120
 121                 int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && free >= fGPUMemSize;
 122
 123                 if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %d / %d)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, free, fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");
 124                 deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free;
 125                 if (deviceOK && deviceSpeed > bestDeviceSpeed)
 126                 {
 127                         bestDevice = i;
 128                         bestDeviceSpeed = deviceSpeed;
 129                 }
 130         }
 131         if (bestDevice == -1)
 132         {
 133                 HLTWarning("No CUDA Device available, aborting CUDA Initialisation");
 134                 fgGPUUsed = 0;
 135                 return(1);
 136         }
 137
 138   int cudaDevice;
 139   if (forceDeviceID == -1)
 140           cudaDevice = bestDevice;
 141   else
 142           cudaDevice = forceDeviceID;
 143 #else
 144         int cudaDevice = 0;
 145 #endif
 146
 147   cudaGetDeviceProperties(&fCudaDeviceProp ,cudaDevice );
 148
 149   if (fDebugLevel >= 1)
 150   {
 151           HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);
 152           HLTInfo("totalGlobalMem = %d", fCudaDeviceProp.totalGlobalMem);
 153           HLTInfo("sharedMemPerBlock = %d", fCudaDeviceProp.sharedMemPerBlock);
 154           HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);
 155           HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);
 156           HLTInfo("memPitch = %d", fCudaDeviceProp.memPitch);
 157           HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);
 158           HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);
 159           HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);
 160           HLTInfo("totalConstMem = %d", fCudaDeviceProp.totalConstMem);
 161           HLTInfo("major = %d", fCudaDeviceProp.major);
 162           HLTInfo("minor = %d", fCudaDeviceProp.minor);
 163           HLTInfo("clockRate %d= ", fCudaDeviceProp.clockRate);
 164           HLTInfo("textureAlignment %d= ", fCudaDeviceProp.textureAlignment);
 165   }
 166
 167   if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
 168   {
 169         HLTError( "Unsupported CUDA Device" );
 170         fgGPUUsed = 0;
 171         return(1);
 172   }
 173
 174   if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
 175   {
 176           HLTError("Insufficiant Tracker Object Memory");
 177           fgGPUUsed = 0;
 178           return(1);
 179   }
 180
 181   if (CudaFailedMsg(cudaSetDevice(cudaDevice)))
 182   {
 183           HLTError("Could not set CUDA Device!");
 184           fgGPUUsed = 0;
 185           return(1);
 186   }
 187
 188   if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
 189   {
 190           HLTError("Insufficiant Common Memory");
 191           cudaThreadExit();
 192           fgGPUUsed = 0;
 193           return(1);
 194   }
 195
 196   if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
 197   {
 198           HLTError("Insufficiant Row Memory");
 199           cudaThreadExit();
 200           fgGPUUsed = 0;
 201           return(1);
 202   }
 203
 204   if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
 205   {
 206           HLTError("CUDA Memory Allocation Error");
 207           cudaThreadExit();
 208           fgGPUUsed = 0;
 209           return(1);
 210   }
 211   if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
 212   int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
 213   if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
 214   {
 215           cudaFree(fGPUMemory);
 216           cudaThreadExit();
 217           HLTError("Error allocating Page Locked Host Memory");
 218           fgGPUUsed = 0;
 219           return(1);
 220   }
 221   if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);
 222
 223   if (fDebugLevel >= 1)
 224   {
 225           CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
 226   }
 227
 228   fSliceCount = sliceCount;
 229   //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
 230   fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);
 231
 232   for (int i = 0;i < fgkNSlices;i++)
 233   {
 234     fSlaveTrackers[i].SetGPUTracker();
 235         fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
 236         fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
 237   }
 238
 239   fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
 240   cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
 241   for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
 242   {
 243         if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
 244         {
 245             cudaFree(fGPUMemory);
 246             cudaFreeHost(fHostLockedMemory);
 247             cudaThreadExit();
 248             HLTError("Error creating CUDA Stream");
 249             fgGPUUsed = 0;
 250             return(1);
 251         }
 252   }
 253
 254   fCudaInitialized = 1;
 255   HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %dd)", cudaDevice, fCudaDeviceProp.name, fThreadId);
 256
 257 #if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)
 258   if (fDebugLevel < 2)
 259   {
 260           //Do one initial run for Benchmark reasons
 261           const int useDebugLevel = fDebugLevel;
 262           fDebugLevel = 0;
 263           AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];
 264
 265           std::ifstream fin;
 266
 267           AliHLTTPCCAParam tmpParam;
 268           AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;
 269
 270           fin.open("events/settings.dump");
 271           int tmpCount;
 272           fin >> tmpCount;
 273           for (int i = 0;i < sliceCount;i++)
 274           {
 275                 fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);
 276                 tmpParam.ReadSettings(fin);
 277                 InitializeSliceParam(i, tmpParam);
 278           }
 279           fin.close();
 280
 281           fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);
 282           for (int i = 0;i < sliceCount;i++)
 283           {
 284                 tmpCluster[i].StartReading(i, 0);
 285                 tmpCluster[i].ReadEvent(fin);
 286                 tmpCluster[i].FinishReading();
 287           }
 288           fin.close();
 289
 290           AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];
 291           memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));
 292
 293           Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);
 294           for (int i = 0;i < sliceCount;i++)
 295           {
 296                   free(tmpOutput[i]);
 297                   tmpOutput[i] = NULL;
 298                   fSlaveTrackers[i].SetOutputControl(NULL);
 299           }
 300           delete[] tmpOutput;
 301           delete[] tmpCluster;
 302           fDebugLevel = useDebugLevel;
 303   }
 304 #endif
 305   return(0);
 306 }
 307
 308 template <class T> inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int alignment)
 309 {
 310         //Macro to align Pointers.
 311         //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker
 312         //(As long as every single data structure is <= 1 MB)
 313
 314         size_t adr = (size_t) ptr;
 315         if (adr % alignment)
 316         {
 317                 adr += alignment - (adr % alignment);
 318         }
 319         return((T*) adr);
 320 }
 321
 322 bool AliHLTTPCCAGPUTracker::CudaFailedMsg(cudaError_t error)
 323 {
 324         //Check for CUDA Error and in the case of an error display the corresponding error string
 325         if (error == cudaSuccess) return(false);
 326         HLTWarning("CUDA Error: %d / %s", error, cudaGetErrorString(error));
 327         return(true);
 328 }
 329
 330 int AliHLTTPCCAGPUTracker::CUDASync(char* state)
 331 {
 332         //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards
 333
 334         if (fDebugLevel == 0) return(0);
 335         cudaError cuErr;
 336         cuErr = cudaGetLastError();
 337         if (cuErr != cudaSuccess)
 338         {
 339                 HLTError("Cuda Error %s while invoking kernel (%s)", cudaGetErrorString(cuErr), state);
 340                 return(1);
 341         }
 342         if (CudaFailedMsg(cudaThreadSynchronize()))
 343         {
 344                 HLTError("CUDA Error while synchronizing (%s)", state);
 345                 return(1);
 346         }
 347         if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
 348         return(0);
 349 }
 350
 351 void AliHLTTPCCAGPUTracker::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
 352 {
 353         //Set Debug Level and Debug output File if applicable
 354         fDebugLevel = dwLevel;
 355         if (NewOutFile) fOutFile = NewOutFile;
 356 }
 357
 358 int AliHLTTPCCAGPUTracker::SetGPUTrackerOption(char* OptionName, int /*OptionValue*/)
 359 {
 360         //Set a specific GPU Tracker Option
 361         {
 362                 HLTError("Unknown Option: %s", OptionName);
 363                 return(1);
 364         }
 365         //No Options used at the moment
 366         //return(0);
 367 }
 368
 369 #ifdef HLTCA_STANDALONE
 370 void AliHLTTPCCAGPUTracker::StandalonePerfTime(int iSlice, int i)
 371 {
 372   //Run Performance Query for timer i of slice iSlice
 373   if (fDebugLevel >= 1)
 374   {
 375           AliHLTTPCCAStandaloneFramework::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));
 376   }
 377 }
 378 #else
 379 void AliHLTTPCCAGPUTracker::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
 380 #endif
 381
 382 void AliHLTTPCCAGPUTracker::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
 383 {
 384         //Dump Rowblocks to File
 385         if (fDebugLevel >= 4)
 386         {
 387                 *fOutFile << "RowBlock Tracklets" << std::endl;
 388
 389                 int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);
 390                 int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);
 391                 uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT);
 392                 CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
 393                 CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
 394                 CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT, cudaMemcpyDeviceToHost));
 395                 CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
 396
 397                 int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
 398                 for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
 399                 {
 400                         *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " <<
 401                                 rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";
 402                         for (int j = 0;j < rowBlockPos[i].x;j++)
 403                         {
 404                                 //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
 405                                 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
 406                                 if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)
 407                                 {
 408                                         HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);
 409                                 }
 410                                 k++;
 411                                 if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)
 412                                 {
 413                                         HLTError("Error, -1 Tracklet found");
 414                                 }
 415                         }
 416                         *fOutFile << std::endl << "Phase 2: ";
 417                         for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
 418                         {
 419                                 *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
 420                         }
 421                         *fOutFile << std::endl;
 422                 }
 423
 424                 if (check)
 425                 {
 426                         *fOutFile << "Starting Threads: (First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;
 427                         for (int i = 0;i < HLTCA_GPU_BLOCK_COUNT;i++)
 428                         {
 429                                 *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;
 430                         }
 431                 }
 432
 433                 free(rowBlockPos);
 434                 free(rowBlockTracklets);
 435                 free(blockStartingTracklet);
 436         }
 437 }
 438
 439 __global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)
 440 {
 441         //Initialize GPU RowBlocks and HitWeights
 442         int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;
 443         int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;
 444         const int stride = blockDim.x * gridDim.x;
 445         int4 i0, i1;
 446         i0.x = i0.y = i0.z = i0.w = 0;
 447         i1.x = i1.y = i1.z = i1.w = -1;
 448         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)
 449                 RowBlockPos[i] = i0;
 450         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)
 451                 rowBlockTracklets4[i] = i1;
 452         for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)
 453                 sliceDataHitWeights4[i] = i0;
 454 }
 455
 456 int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
 457 {
 458         //Primary reconstruction function
 459         cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
 460
 461         if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;
 462
 463         if (!fCudaInitialized)
 464         {
 465             HLTError("GPUTracker not initialized");
 466             return(1);
 467         }
 468         if (sliceCountLocal > fSliceCount)
 469         {
 470             HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);
 471             return(1);
 472         }
 473         if (fThreadId != GetThread())
 474         {
 475             HLTError("GPUTracker context was initialized by different thread, Initializing Thread: %d, Processing Thread: %d", fThreadId, GetThread());
 476             return(1);
 477         }
 478
 479         if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);
 480
 481         if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
 482         {
 483                 HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));
 484                 return(1);
 485         }
 486
 487         if (fDebugLevel >= 4)
 488         {
 489                 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
 490                 {
 491                         *fOutFile << std::endl << std::endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << std::endl;
 492                 }
 493         }
 494
 495         memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
 496
 497         if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
 498
 499 #ifdef HLTCA_GPU_TIME_PROFILE
 500         unsigned __int64 a, b, c, d;
 501         AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);
 502         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);
 503 #endif
 504
 505         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
 506         {
 507                 //Make this a GPU Tracker
 508                 fGpuTracker[iSlice].SetGPUTracker();
 509                 fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
 510                 fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
 511                 fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
 512
 513                 //Set Pointers to GPU Memory
 514                 char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
 515
 516                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
 517                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
 518                 tmpMem = alignPointer(tmpMem, 1024 * 1024);
 519
 520                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
 521                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);
 522                 tmpMem = alignPointer(tmpMem, 1024 * 1024);
 523
 524                 if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
 525                 tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());
 526                 tmpMem = alignPointer(tmpMem, 1024 * 1024);
 527
 528                 if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
 529                 {
 530                         HLTError("Insufficiant Track Memory");
 531                         return(1);
 532                 }
 533
 534                 if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
 535                 {
 536                         HLTError("Insufficiant Global Memory");
 537                         return(1);
 538                 }
 539
 540                 //Initialize Startup Constants
 541                 *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
 542                 *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
 543                 *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
 544                 fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = HLTCA_GPU_BLOCK_COUNT * (iSlice + 1) / sliceCountLocal - HLTCA_GPU_BLOCK_COUNT * (iSlice) / sliceCountLocal;
 545                 if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);
 546                 fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
 547                 fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
 548                 fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
 549                 fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
 550         }
 551
 552 #ifdef HLTCA_GPU_TEXTURE_FETCH
 553                 cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
 554                 size_t offset;
 555                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
 556                 {
 557                         HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
 558                         return(1);
 559                 }
 560                 cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
 561                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
 562                 {
 563                         HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
 564                         return(1);
 565                 }
 566                 cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
 567                 if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
 568                 {
 569                         HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
 570                         return(1);
 571                 }
 572 #endif
 573
 574         //Copy Tracker Object to GPU Memory
 575         if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
 576 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
 577         if (CudaFailedMsg(cudaMalloc(&fGpuTracker[0].fStageAtSync, 100000000))) return(1);
 578         CudaFailedMsg(cudaMemset(fGpuTracker[0].fStageAtSync, 0, 100000000));
 579 #endif
 580         CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));
 581
 582         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
 583         {
 584                 StandalonePerfTime(firstSlice + iSlice, 0);
 585
 586                 //Initialize GPU Slave Tracker
 587                 if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
 588                 fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
 589 #ifdef HLTCA_GPU_TIME_PROFILE
 590                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
 591 #endif
 592                 fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
 593 #ifdef HLTCA_GPU_TIME_PROFILE
 594                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
 595                 printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
 596 #endif
 597                 if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
 598                 {
 599                         HLTError("Insufficiant Slice Data Memory");
 600                         return(1);
 601                 }
 602
 603                 /*if (fSlaveTrackers[firstSlice + iSlice].CheckEmptySlice())
 604                 {
 605                         if (fDebugLevel >= 3) HLTInfo("Slice Empty, not running GPU Tracker");
 606                         if (sliceCountLocal == 1)
 607                                 return(0);
 608                 }*/
 609
 610                 //Initialize temporary memory where needed
 611                 if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");
 612                 PreInitRowBlocks<<<30, 256, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());
 613
 614                 //Copy Data to GPU Global Memory
 615                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
 616                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
 617                 CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
 618
 619                 if (fDebugLevel >= 4)
 620                 {
 621                         if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
 622                         fSlaveTrackers[firstSlice + iSlice].TrackletMemory() = reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] );
 623                         fSlaveTrackers[firstSlice + iSlice].SetPointersTracklets( HLTCA_GPU_MAX_TRACKLETS );
 624                         fSlaveTrackers[firstSlice + iSlice].HitMemory() = reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100] );
 625                         fSlaveTrackers[firstSlice + iSlice].SetPointersHits( pClusterData[iSlice].NumberOfClusters() );
 626                 }
 627
 628                 if (CUDASync("Initialization")) return(1);
 629                 StandalonePerfTime(firstSlice + iSlice, 1);
 630
 631                 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");
 632                 AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
 633
 634                 if (CUDASync("Neighbours finder")) return 1;
 635
 636                 StandalonePerfTime(firstSlice + iSlice, 2);
 637
 638                 if (fDebugLevel >= 4)
 639                 {
 640                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
 641                         fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
 642                 }
 643
 644                 if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");
 645                 AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
 646                 if (CUDASync("Neighbours Cleaner")) return 1;
 647
 648                 StandalonePerfTime(firstSlice + iSlice, 3);
 649
 650                 if (fDebugLevel >= 4)
 651                 {
 652                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
 653                         fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
 654                 }
 655
 656                 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");
 657                 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
 658                 if (CUDASync("Start Hits Finder")) return 1;
 659
 660                 StandalonePerfTime(firstSlice + iSlice, 4);
 661
 662                 if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter");
 663                 AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<30, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
 664                 if (CUDASync("Start Hits Sorter")) return 1;
 665
 666                 StandalonePerfTime(firstSlice + iSlice, 5);
 667
 668                 if (fDebugLevel >= 2)
 669                 {
 670                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
 671                         if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
 672                         if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS)
 673                         {
 674                                 HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant");
 675                                 return(1);
 676                         }
 677                 }
 678
 679                 if (fDebugLevel >= 4)
 680                 {
 681                         *fOutFile << "Temporary ";
 682                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
 683                         fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
 684                         uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
 685                         CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
 686                         *fOutFile << "Start Hits Sort Vector:" << std::endl;
 687                         for (int i = 0;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows();i++)
 688                         {
 689                                 *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;
 690                         }
 691                         free(tmpMemory);
 692
 693                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
 694                         fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
 695                 }
 696
 697                 StandalonePerfTime(firstSlice + iSlice, 6);
 698
 699                 fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
 700         }
 701
 702         StandalonePerfTime(firstSlice, 7);
 703 #ifdef HLTCA_GPU_PREFETCHDATA
 704         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
 705         {
 706                 if (fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v) > ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4))
 707                 {
 708                         HLTError("Insufficiant GPU shared Memory, required: %d, available %d", fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v), ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4));
 709                         return(1);
 710                 }
 711                 if (fDebugLevel >= 1)
 712                 {
 713                         static int infoShown = 0;
 714                         if (!infoShown)
 715                         {
 716                                 HLTInfo("GPU Shared Memory Cache Size: %d", 2 * fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v));
 717                                 infoShown = 1;
 718                         }
 719                 }
 720         }
 721 #endif
 722
 723         int nHardCollisions = 0;
 724
 725 RestartTrackletConstructor:
 726         if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
 727         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
 728         {
 729                 AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);
 730                 if (CUDASync("Tracklet Initializer")) return 1;
 731                 DumpRowBlocks(fSlaveTrackers, iSlice);
 732         }
 733
 734         if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
 735         AliHLTTPCCATrackletConstructorNewGPU<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>();
 736         if (CUDASync("Tracklet Constructor (new)")) return 1;
 737
 738         StandalonePerfTime(firstSlice, 8);
 739
 740         if (fDebugLevel >= 4)
 741         {
 742                 for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
 743                 {
 744                         DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);
 745                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
 746                         if (fDebugLevel >= 5)
 747                         {
 748                                 HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
 749                         }
 750                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
 751                         CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
 752                         fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);
 753                 }
 754         }
 755
 756         int runSlices = 0;
 757         for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)
 758         {
 759                 if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;
 760                 if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);
 761                 AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));
 762         }
 763         if (CUDASync("Tracklet Selector")) return 1;
 764         StandalonePerfTime(firstSlice, 9);
 765
 766         CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + 0].CommonMemory(), fGpuTracker[0].CommonMemory(), fGpuTracker[0].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[0]));
 767         for (int iSliceTmp = 0;iSliceTmp <= sliceCountLocal;iSliceTmp++)
 768         {
 769                 if (iSliceTmp < sliceCountLocal)
 770                 {
 771                         int iSlice = iSliceTmp;
 772                         if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");
 773                         cudaStreamSynchronize(cudaStreams[iSlice]);
 774                         CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
 775                         CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
 776                         if (iSlice + 1 < sliceCountLocal)
 777                                 CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[iSlice + 1]));
 778                 }
 779
 780                 if (iSliceTmp)
 781                 {
 782                         int iSlice = iSliceTmp - 1;
 783                         cudaStreamSynchronize(cudaStreams[iSlice]);
 784
 785                         if (fDebugLevel >= 4)
 786                         {
 787                                 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
 788                                 fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
 789                                 fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);
 790                         }
 791
 792                         if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
 793                         {
 794                                 if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION && nHardCollisions++ < 10)
 795                                 {
 796                                         HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor");
 797                                         for (int i = 0;i < sliceCountLocal;i++)
 798                                         {
 799                                                 cudaThreadSynchronize();
 800                                                 CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));
 801                                                 *fSlaveTrackers[firstSlice + i].NTracks() = 0;
 802                                                 *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;
 803                                                 fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;
 804                                                 CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));
 805                                                 PreInitRowBlocks<<<30, 256>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());
 806                                         }
 807                                         goto RestartTrackletConstructor;
 808                                 }
 809                                 HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
 810                                 return(1);
 811                         }
 812                         if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
 813
 814                         fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
 815 #ifdef HLTCA_GPU_TIME_PROFILE
 816                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
 817 #endif
 818                         fSlaveTrackers[firstSlice + iSlice].WriteOutput();
 819 #ifdef HLTCA_GPU_TIME_PROFILE
 820                         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
 821                         printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
 822 #endif
 823
 824                         if (fDebugLevel >= 4)
 825                         {
 826                                 delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();
 827                                 delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();
 828                         }
 829                 }
 830         }
 831
 832         StandalonePerfTime(firstSlice, 10);
 833
 834         if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");
 835
 836 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
 837         char* stageAtSync = (char*) malloc(100000000);
 838         CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].fStageAtSync, 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
 839         cudaFree(fGpuTracker[0].fStageAtSync);
 840
 841         FILE* fp = fopen("profile.txt", "w+");
 842         FILE* fp2 = fopen("profile.bmp", "w+b");
 843         int nEmptySync = 0, fEmpty;
 844
 845         const int bmpheight = 1000;
 846         BITMAPFILEHEADER bmpFH;
 847         BITMAPINFOHEADER bmpIH;
 848         ZeroMemory(&bmpFH, sizeof(bmpFH));
 849         ZeroMemory(&bmpIH, sizeof(bmpIH));
 850
 851         bmpFH.bfType = 19778; //"BM"
 852         bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1) * bmpheight ;
 853         bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);
 854
 855         bmpIH.biSize = sizeof(bmpIH);
 856         bmpIH.biWidth = HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1;
 857         bmpIH.biHeight = bmpheight;
 858         bmpIH.biPlanes = 1;
 859         bmpIH.biBitCount = 32;
 860
 861         fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
 862         fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);
 863
 864         for (int i = 0;i < bmpheight * HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;i += HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT)
 865         {
 866                 fEmpty = 1;
 867                 for (int j = 0;j < HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;j++)
 868                 {
 869                         fprintf(fp, "%d\t", stageAtSync[i + j]);
 870                         int color = 0;
 871                         if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);
 872                         if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);
 873                         if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);
 874                         if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);
 875                         fwrite(&color, 1, sizeof(int), fp2);
 876                         if (j > 0 && j % 32 == 0)
 877                         {
 878                                 color = RGB(255, 255, 255);
 879                                 fwrite(&color, 1, 4, fp2);
 880                         }
 881                         if (stageAtSync[i + j]) fEmpty = 0;
 882                 }
 883                 fprintf(fp, "\n");
 884                 if (fEmpty) nEmptySync++;
 885                 else nEmptySync = 0;
 886                 //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;
 887         }
 888
 889         fclose(fp);
 890         fclose(fp2);
 891         free(stageAtSync);
 892 #endif
 893
 894         return(0);
 895 }
 896
 897 int AliHLTTPCCAGPUTracker::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
 898 {
 899         //Initialize Slice Tracker Parameter for a slave tracker
 900         fSlaveTrackers[iSlice].Initialize(param);
 901         if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
 902         {
 903                 HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
 904                 return(1);
 905         }
 906         return(0);
 907 }
 908
 909 int AliHLTTPCCAGPUTracker::ExitGPU()
 910 {
 911         //Uninitialize CUDA
 912         cudaThreadSynchronize();
 913         if (fGPUMemory)
 914         {
 915                 cudaFree(fGPUMemory);
 916                 fGPUMemory = NULL;
 917         }
 918         if (fHostLockedMemory)
 919         {
 920                 for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
 921                 {
 922                         cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
 923                 }
 924                 free(fpCudaStreams);
 925                 fGpuTracker = NULL;
 926                 cudaFreeHost(fHostLockedMemory);
 927         }
 928
 929         if (CudaFailedMsg(cudaThreadExit()))
 930         {
 931                 HLTError("Could not uninitialize GPU");
 932                 return(1);
 933         }
 934         HLTInfo("CUDA Uninitialized");
 935         fgGPUUsed = false;
 936         fCudaInitialized = 0;
 937         return(0);
 938 }
 939
 940 void AliHLTTPCCAGPUTracker::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
 941 {
 942         fOutputControl = val;
 943         for (int i = 0;i < fgkNSlices;i++)
 944         {
 945                 fSlaveTrackers[i].SetOutputControl(val);
 946         }
 947 }
 948
 949 int AliHLTTPCCAGPUTracker::GetThread()
 950 {
 951 #ifdef R__WIN32
 952         return((int) (size_t) GetCurrentThread());
 953 #else
 954         return((int) syscall (SYS_gettid));
 955 #endif
 956 }
 957
 958 #endif