// **************************************************************************
// This file is property of and copyright by the ALICE HLT Project          *
// ALICE Experiment at CERN, All rights reserved.                           *
//                                                                          *
// Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
//                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *
//                  David Rohr <drohr@kip.uni-heidelberg.de>                *
//                  for The ALICE HLT Project.                              *
//                                                                          *
// Permission to use, copy, modify and distribute this software and its     *
// documentation strictly for non-commercial purposes is hereby granted     *
// without fee, provided that the above copyright notice appears in all     *
// copies and that both the copyright notice and this permission notice     *
// appear in the supporting documentation. The authors make no claims       *
// about the suitability of this software for any purpose. It is            *
// provided "as is" without express or implied warranty.                    *
//                                                                          *
//***************************************************************************

#include "AliHLTTPCCAGPUTracker.h"

#ifdef BUILD_GPU

#include "AliHLTTPCCADef.h"
#include "AliHLTTPCCAGPUConfig.h"

#include <sm_11_atomic_functions.h>
#include <sm_12_atomic_functions.h>

#include <iostream>

//Disable assertions since they produce errors in GPU Code
#ifdef assert
#undef assert
#endif
#define assert(param)

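//The per-slice tracker objects are kept in GPU constant memory (filled via
//cudaMemcpyToSymbolAsync in Reconstruct); the optional texture references below
//are bound to the slice data in global memory when HLTCA_GPU_TEXTURE_FETCH is set.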
__constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];
#ifdef HLTCA_GPU_TEXTURE_FETCH
texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;
texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;
texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
#endif

#include "AliHLTTPCCAHit.h"

//Include CXX files; the GPUd() macro will then produce CUDA device code from the tracker source code
#include "AliHLTTPCCATrackParam.cxx"
#include "AliHLTTPCCATrack.cxx"

#include "AliHLTTPCCATrackletSelector.cxx"

#include "AliHLTTPCCAHitArea.cxx"
#include "AliHLTTPCCAGrid.cxx"
#include "AliHLTTPCCARow.cxx"
#include "AliHLTTPCCAParam.cxx"
#include "AliHLTTPCCATracker.cxx"

#include "AliHLTTPCCAOutTrack.cxx"

#include "AliHLTTPCCAProcess.h"

#include "AliHLTTPCCANeighboursFinder.cxx"

#include "AliHLTTPCCANeighboursCleaner.cxx"
#include "AliHLTTPCCAStartHitsFinder.cxx"
#include "AliHLTTPCCAStartHitsSorter.cxx"
#include "AliHLTTPCCATrackletConstructor.cxx"
#include "AliHLTTPCCASliceOutput.cxx"

#include "MemoryAssignmentHelpers.h"

#ifndef HLTCA_STANDALONE
#include "AliHLTDefinitions.h"
#include "AliHLTSystem.h"
#endif

ClassImp( AliHLTTPCCAGPUTracker )

bool AliHLTTPCCAGPUTracker::fgGPUUsed = false;

int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
{
	//Find the best CUDA device, initialize it and allocate memory

	if (fgGPUUsed)
	{
		HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in the same process");
		return(1);
	}

	cudaDeviceProp fCudaDeviceProp;

#ifndef CUDA_DEVICE_EMULATION
	int count, bestDevice = -1, bestDeviceSpeed = 0;
	if (CudaFailedMsg(cudaGetDeviceCount(&count)))
	{
		HLTError("Error getting CUDA Device Count");
		return(1);
	}
	if (fDebugLevel >= 2) std::cout << "Available CUDA devices: ";
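	//Pick the fastest usable device: compute capability must be at least 1.2 (and below 9),
	//and devices are ranked by multiProcessorCount * clockRate.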
	for (int i = 0;i < count;i++)
	{
		cudaGetDeviceProperties(&fCudaDeviceProp, i);
		if (fDebugLevel >= 2) std::cout << fCudaDeviceProp.name << " (" << i << ")     ";
		if (fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && fCudaDeviceProp.multiProcessorCount * fCudaDeviceProp.clockRate > bestDeviceSpeed)
		{
			bestDevice = i;
			bestDeviceSpeed = fCudaDeviceProp.multiProcessorCount * fCudaDeviceProp.clockRate;
		}
	}
	if (fDebugLevel >= 2) std::cout << std::endl;

	if (bestDevice == -1)
	{
		HLTWarning("No CUDA Device available, aborting CUDA Initialisation");
		return(1);
	}

	int cudaDevice;
	if (forceDeviceID == -1)
		cudaDevice = bestDevice;
	else
		cudaDevice = forceDeviceID;
#else
	int cudaDevice = 0;
#endif

	cudaGetDeviceProperties(&fCudaDeviceProp, cudaDevice);

	if (fDebugLevel >= 1)
	{
		std::cout<<"CUDA Device Properties: "<<std::endl;
		std::cout<<"name = "<<fCudaDeviceProp.name<<std::endl;
		std::cout<<"totalGlobalMem = "<<fCudaDeviceProp.totalGlobalMem<<std::endl;
		std::cout<<"sharedMemPerBlock = "<<fCudaDeviceProp.sharedMemPerBlock<<std::endl;
		std::cout<<"regsPerBlock = "<<fCudaDeviceProp.regsPerBlock<<std::endl;
		std::cout<<"warpSize = "<<fCudaDeviceProp.warpSize<<std::endl;
		std::cout<<"memPitch = "<<fCudaDeviceProp.memPitch<<std::endl;
		std::cout<<"maxThreadsPerBlock = "<<fCudaDeviceProp.maxThreadsPerBlock<<std::endl;
		std::cout<<"maxThreadsDim = "<<fCudaDeviceProp.maxThreadsDim[0]<<" "<<fCudaDeviceProp.maxThreadsDim[1]<<" "<<fCudaDeviceProp.maxThreadsDim[2]<<std::endl;
		std::cout<<"maxGridSize = "  <<fCudaDeviceProp.maxGridSize[0]<<" "<<fCudaDeviceProp.maxGridSize[1]<<" "<<fCudaDeviceProp.maxGridSize[2]<<std::endl;
		std::cout<<"totalConstMem = "<<fCudaDeviceProp.totalConstMem<<std::endl;
		std::cout<<"major = "<<fCudaDeviceProp.major<<std::endl;
		std::cout<<"minor = "<<fCudaDeviceProp.minor<<std::endl;
		std::cout<<"clockRate = "<<fCudaDeviceProp.clockRate<<std::endl;
		std::cout<<"textureAlignment = "<<fCudaDeviceProp.textureAlignment<<std::endl;
	}

	if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
	{
		HLTError( "Unsupported CUDA Device" );
		return(1);
	}

	if (CudaFailedMsg(cudaSetDevice(cudaDevice)))
	{
		HLTError("Could not set CUDA Device!");
		return(1);
	}

	if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
	{
		HLTError("Insufficient Common Memory");
		cudaThreadExit();
		return(1);
	}

	if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
	{
		HLTError("Insufficient Row Memory");
		cudaThreadExit();
		return(1);
	}

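	//The single GPU allocation holds the row memory, the common memory, and for each
	//processed slice a slice-data region plus a global (hits/tracklets/tracks) region.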
	fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
	if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
	{
		HLTError("CUDA Memory Allocation Error");
		cudaThreadExit();
		return(1);
	}
	if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
	int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
	if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
	{
		cudaFree(fGPUMemory);
		cudaThreadExit();
		HLTError("Error allocating Page Locked Host Memory");
		return(1);
	}
	if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);

	if (fDebugLevel >= 1)
	{
		CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
	}
	HLTInfo("CUDA Initialisation successful");

	//Don't run constructor / destructor here, this will just be a local memcopy of the Tracker into GPU Memory
	if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
	{
		HLTError("Insufficient Tracker Object Memory");
		return(1);
	}
	fSliceCount = sliceCount;
	fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);

	for (int i = 0;i < fgkNSlices;i++)
	{
		fSlaveTrackers[i].SetGPUTracker();
		fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
		fSlaveTrackers[i].pData()->SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
	}

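	//Create one CUDA stream per slice (at least 3), so that data transfers and kernels
	//of different slices can overlap during reconstruction.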
	fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
	cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
	for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
	{
		if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
		{
			HLTError("Error creating CUDA Stream");
			return(1);
		}
	}

#if defined(HLTCA_STANDALONE) && !defined(CUDA_DEVICE_EMULATION)
	if (fDebugLevel < 2)
	{
		//Do one initial run for Benchmark reasons
		const int useDebugLevel = fDebugLevel;
		fDebugLevel = 0;
		AliHLTTPCCAClusterData tmpCluster;

		std::ifstream fin;
		fin.open("events/event.0.dump");
		tmpCluster.ReadEvent(fin);
		fin.close();

		AliHLTTPCCASliceOutput *tmpOutput = NULL;
		AliHLTTPCCAParam tmpParam;
		AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;
		fSlaveTrackers[0].SetOutputControl(&tmpOutputControl);
		tmpParam.SetNRows(HLTCA_ROW_COUNT);
		fSlaveTrackers[0].SetParam(tmpParam);
		Reconstruct(&tmpOutput, &tmpCluster, 0, 1);
		free(tmpOutput);
		tmpOutput = NULL;
		fSlaveTrackers[0].SetOutputControl(NULL);
		fDebugLevel = useDebugLevel;
	}
#endif
	fgGPUUsed = true;
	return(0);
}

template <class T> inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int alignment)
{
	//Helper to align pointers.
	//Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker
	//(As long as every single data structure is <= 1 MB)

	size_t adr = (size_t) ptr;
	if (adr % alignment)
	{
		adr += alignment - (adr % alignment);
	}
	return((T*) adr);
}

bool AliHLTTPCCAGPUTracker::CudaFailedMsg(cudaError_t error)
{
	//Check for CUDA Error and in the case of an error display the corresponding error string
	if (error == cudaSuccess) return(false);
	HLTWarning("CUDA Error: %d / %s", error, cudaGetErrorString(error));
	return(true);
}

int AliHLTTPCCAGPUTracker::CUDASync(char* state)
{
	//Wait for CUDA-Kernel to finish and check for CUDA errors afterwards

	if (fDebugLevel == 0) return(0);
	cudaError cuErr;
	cuErr = cudaGetLastError();
	if (cuErr != cudaSuccess)
	{
		HLTError("Cuda Error %s while invoking kernel (%s)", cudaGetErrorString(cuErr), state);
		return(1);
	}
	if (CudaFailedMsg(cudaThreadSynchronize()))
	{
		HLTError("CUDA Error while synchronizing (%s)", state);
		return(1);
	}
	if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");
	return(0);
}

void AliHLTTPCCAGPUTracker::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
{
	//Set Debug Level and Debug output File if applicable
	fDebugLevel = dwLevel;
	if (NewOutFile) fOutFile = NewOutFile;
}

int AliHLTTPCCAGPUTracker::SetGPUTrackerOption(char* OptionName, int /*OptionValue*/)
{
	//Set a specific GPU Tracker Option
	{
		HLTError("Unknown Option: %s", OptionName);
		return(1);
	}
	//No Options used at the moment
	//return(0);
}

#ifdef HLTCA_STANDALONE
void AliHLTTPCCAGPUTracker::StandalonePerfTime(int iSlice, int i)
{
	//Run Performance Query for timer i of slice iSlice
	if (fDebugLevel >= 1)
	{
		AliHLTTPCCAStandaloneFramework::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));
	}
}
#else
void AliHLTTPCCAGPUTracker::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
#endif

void AliHLTTPCCAGPUTracker::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
{
	//Dump Rowblocks to File
	if (fDebugLevel >= 4)
	{
		*fOutFile << "RowBlock Tracklets" << std::endl;

		int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);
		int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);
		uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT);
		CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
		CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
		CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT, cudaMemcpyDeviceToHost));
		CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));

		int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
		for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
		{
			*fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " <<
				rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";
			for (int j = 0;j < rowBlockPos[i].x;j++)
			{
				//Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
				*fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
				if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)
				{
					HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);
				}
				k++;
				if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)
				{
					HLTError("Error, -1 Tracklet found");
				}
			}
			*fOutFile << std::endl << "Phase 2: ";
			for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
			{
				*fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
			}
			*fOutFile << std::endl;
		}

		if (check)
		{
			*fOutFile << "Starting Threads: (First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;
			for (int i = 0;i < HLTCA_GPU_BLOCK_COUNT;i++)
			{
				*fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;
			}
		}

		free(rowBlockPos);
		free(rowBlockTracklets);
		free(blockStartingTracklet);
	}
}

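//Small helper kernel, executed before each event: RowBlockPos is cleared to 0, the
//RowBlockTracklets lists are set to -1 (empty) and the slice data hit weights are zeroed.
//All stores go through int4 so the writes are coalesced 16-byte accesses.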
__global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)
{
	//Initialize GPU RowBlocks and HitWeights
	int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;
	int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;
	const int stride = blockDim.x * gridDim.x;
	int4 i0, i1;
	i0.x = i0.y = i0.z = i0.w = 0;
	i1.x = i1.y = i1.z = i1.w = -1;
	for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)
		RowBlockPos[i] = i0;
	for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)
		rowBlockTracklets4[i] = i1;
	for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)
		sliceDataHitWeights4[i] = i0;
}

int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
{
	//Primary reconstruction function
	cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;

	if (sliceCountLocal == -1) sliceCountLocal = this->fSliceCount;

	if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
	{
		HLTError("Insufficient constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));
		return(1);
	}

	if (fDebugLevel >= 4)
	{
		for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
		{
			*fOutFile << std::endl << std::endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << std::endl;
		}
	}

	memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);

	if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice + sliceCountLocal - 1].Param().ISlice());
	if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");

#ifdef HLTCA_GPU_TIME_PROFILE
	__int64 a, b, c, d;
	QueryPerformanceFrequency((LARGE_INTEGER*) &c);
	QueryPerformanceCounter((LARGE_INTEGER*) &d);
#endif

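	//First pass over the slices: configure each GPU tracker object on the host and
	//assign its regions inside the single large GPU memory block (hits, tracklets, tracks).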
	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		//Make this a GPU Tracker
		fGpuTracker[iSlice].SetGPUTracker();
		fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
		fGpuTracker[iSlice].pData()->SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
		fGpuTracker[iSlice].pData()->SetPointers(&pClusterData[iSlice], false);

		//Set Pointers to GPU Memory
		char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);

		if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");
		tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
		tmpMem = alignPointer(tmpMem, 1024 * 1024);

		if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");
		tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);
		tmpMem = alignPointer(tmpMem, 1024 * 1024);

		if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");
		tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());
		tmpMem = alignPointer(tmpMem, 1024 * 1024);

		if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
		{
			HLTError("Insufficient Track Memory");
			return(1);
		}

		if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
		{
			HLTError("Insufficient Global Memory");
			return(1);
		}

		//Initialize Startup Constants
		*fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
		*fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
		*fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
		fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = HLTCA_GPU_BLOCK_COUNT * (iSlice + 1) / sliceCountLocal - HLTCA_GPU_BLOCK_COUNT * (iSlice) / sliceCountLocal;
		if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);
		fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
		fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
		fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
		fGpuTracker[iSlice].pData()->SetGPUTextureBase(fGpuTracker[0].Data().Memory());
	}

#ifdef HLTCA_GPU_TEXTURE_FETCH
	cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();
	size_t offset;
	if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
	{
		HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
		return(1);
	}
	cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
	if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
	{
		HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
		return(1);
	}
	cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
	if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
	{
		HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
		return(1);
	}
#endif

	//Copy Tracker Object to GPU Memory
	if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");
#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
	if (CudaFailedMsg(cudaMalloc(&fGpuTracker[0].fStageAtSync, 100000000))) return(1);
	CudaFailedMsg(cudaMemset(fGpuTracker[0].fStageAtSync, 0, 100000000));
#endif
	CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));

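	//Second pass: per slice, build the slice data on the host, copy it to the GPU and run the
	//first tracking kernels. Copies and kernels of consecutive slices alternate between two
	//CUDA streams (iSlice & 1) so that transfer and computation can overlap; stream 2 is used
	//for the temporary-memory initialization kernel.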
	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		StandalonePerfTime(firstSlice + iSlice, 0);

		//Initialize GPU Slave Tracker
		if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
		fSlaveTrackers[firstSlice + iSlice].pData()->SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
#ifdef HLTCA_GPU_TIME_PROFILE
		QueryPerformanceCounter((LARGE_INTEGER*) &a);
#endif
		fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
#ifdef HLTCA_GPU_TIME_PROFILE
		QueryPerformanceCounter((LARGE_INTEGER*) &b);
		printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
#endif
		if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
		{
			HLTError("Insufficient Slice Data Memory");
			return(1);
		}

		/*if (fSlaveTrackers[firstSlice + iSlice].CheckEmptySlice())
		{
			if (fDebugLevel >= 3) HLTInfo("Slice Empty, not running GPU Tracker");
			if (sliceCountLocal == 1)
				return(0);
		}*/

		//Initialize temporary memory where needed
		if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");
		PreInitRowBlocks<<<30, 256, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());

		//Copy Data to GPU Global Memory
		CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
		CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
		CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));

		if (fDebugLevel >= 4)
		{
			if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
			fSlaveTrackers[firstSlice + iSlice].TrackletMemory() = reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] );
			fSlaveTrackers[firstSlice + iSlice].SetPointersTracklets( HLTCA_GPU_MAX_TRACKLETS );
			fSlaveTrackers[firstSlice + iSlice].HitMemory() = reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100] );
			fSlaveTrackers[firstSlice + iSlice].SetPointersHits( pClusterData[iSlice].NumberOfClusters() );
		}

		if (CUDASync("Initialization")) return(1);
		StandalonePerfTime(firstSlice + iSlice, 1);

		if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");
		AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);

		if (CUDASync("Neighbours finder")) return 1;

		StandalonePerfTime(firstSlice + iSlice, 2);

		if (fDebugLevel >= 4)
		{
			CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
			fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
		}

		if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");
		AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
		if (CUDASync("Neighbours Cleaner")) return 1;

		StandalonePerfTime(firstSlice + iSlice, 3);

		if (fDebugLevel >= 4)
		{
			CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
			fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
		}

		if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");
		AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
		if (CUDASync("Start Hits Finder")) return 1;

		StandalonePerfTime(firstSlice + iSlice, 4);

		if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter");
		AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<30, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
		if (CUDASync("Start Hits Sorter")) return 1;

		StandalonePerfTime(firstSlice + iSlice, 5);

		if (fDebugLevel >= 2)
		{
			CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
			if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
			if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS)
			{
				HLTError("HLTCA_GPU_MAX_TRACKLETS constant insufficient");
				return(1);
			}
		}

		if (fDebugLevel >= 4)
		{
			*fOutFile << "Temporary ";
			CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
			fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
			uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
			CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
			*fOutFile << "Start Hits Sort Vector:" << std::endl;
			for (int i = 0;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows();i++)
			{
				*fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;
			}
			free(tmpMemory);

			CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
			fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
		}

		StandalonePerfTime(firstSlice + iSlice, 6);

		fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
	}

	StandalonePerfTime(firstSlice, 7);
#ifdef HLTCA_GPU_PREFETCHDATA
	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		if (fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v) > ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4))
		{
			HLTError("Insufficient GPU shared memory, required: %d, available: %d", fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v), ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4));
			return(1);
		}
		if (fDebugLevel >= 1)
		{
			static int infoShown = 0;
			if (!infoShown)
			{
				HLTInfo("GPU Shared Memory Cache Size: %d", 2 * fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v));
				infoShown = 1;
			}
		}
	}
#endif

	if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
	{
		AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);
		if (CUDASync("Tracklet Initializer")) return 1;
		DumpRowBlocks(fSlaveTrackers, iSlice);
	}

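	//The tracklet constructor runs as a single kernel over all slices; its work is taken
	//from the row blocks filled by AliHLTTPCCATrackletConstructorInit above (dynamic scheduling).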
	if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");
	AliHLTTPCCATrackletConstructorNewGPU<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>();
	if (CUDASync("Tracklet Constructor (new)")) return 1;

	StandalonePerfTime(firstSlice, 8);

	if (fDebugLevel >= 4)
	{
		for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
		{
			DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);
			CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
			if (fDebugLevel >= 5)
			{
				HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
			}
			CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
			CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
			fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);
		}
	}

	for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT)
	{
		if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT);
		AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT, sliceCountLocal - iSlice));
	}
	if (CUDASync("Tracklet Selector")) return 1;
	StandalonePerfTime(firstSlice, 9);

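	//Fetch the results slice by slice. The loop is pipelined: while the tracks of slice i are
	//still being copied back asynchronously, WriteOutput for slice i-1 already runs on the host.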
	CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + 0].CommonMemory(), fGpuTracker[0].CommonMemory(), fGpuTracker[0].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[0]));
	for (int iSliceTmp = 0;iSliceTmp <= sliceCountLocal;iSliceTmp++)
	{
		if (iSliceTmp < sliceCountLocal)
		{
			int iSlice = iSliceTmp;
			if (fDebugLevel >= 3) HLTInfo("Transferring Tracks from GPU to Host");
			cudaStreamSynchronize(cudaStreams[iSlice]);
			CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
			CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
			if (iSlice + 1 < sliceCountLocal)
				CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[iSlice + 1]));
		}

		if (iSliceTmp)
		{
			int iSlice = iSliceTmp - 1;
			cudaStreamSynchronize(cudaStreams[iSlice]);

			if (fDebugLevel >= 4)
			{
				CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
				fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
				fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);
			}

			if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
			{
				HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
				return(1);
			}
			if (fDebugLevel >= 3) HLTInfo("Tracks Transferred: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());

			fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
#ifdef HLTCA_GPU_TIME_PROFILE
			QueryPerformanceCounter((LARGE_INTEGER*) &a);
#endif
			fSlaveTrackers[firstSlice + iSlice].WriteOutput();
#ifdef HLTCA_GPU_TIME_PROFILE
			QueryPerformanceCounter((LARGE_INTEGER*) &b);
			printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
#endif

			if (fDebugLevel >= 4)
			{
				delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();
				delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();
			}
		}
	}

	StandalonePerfTime(firstSlice, 10);

	if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");

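	//Optional profiling output: the per-thread synchronization stages recorded under
	//HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE are dumped as a text table (profile.txt)
	//and as a BMP image (profile.bmp), one colored pixel per thread and synchronization point.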
#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
	char* stageAtSync = (char*) malloc(100000000);
	CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].fStageAtSync, 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
	cudaFree(fGpuTracker[0].fStageAtSync);

	FILE* fp = fopen("profile.txt", "w+");
	FILE* fp2 = fopen("profile.bmp", "w+b");
	int nEmptySync = 0, fEmpty;

	const int bmpheight = 1000;
	BITMAPFILEHEADER bmpFH;
	BITMAPINFOHEADER bmpIH;
	ZeroMemory(&bmpFH, sizeof(bmpFH));
	ZeroMemory(&bmpIH, sizeof(bmpIH));

	bmpFH.bfType = 19778; //"BM"
	bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1) * bmpheight;
	bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);

	bmpIH.biSize = sizeof(bmpIH);
	bmpIH.biWidth = HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1;
	bmpIH.biHeight = bmpheight;
	bmpIH.biPlanes = 1;
	bmpIH.biBitCount = 32;

	fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
	fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);

	for (int i = 0;i < bmpheight * HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;i += HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT)
	{
		fEmpty = 1;
		for (int j = 0;j < HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;j++)
		{
			fprintf(fp, "%d\t", stageAtSync[i + j]);
			int color = 0;
			if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);
			if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);
			if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);
			if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);
			fwrite(&color, 1, sizeof(int), fp2);
			if (j > 0 && j % 32 == 0)
			{
				color = RGB(255, 255, 255);
				fwrite(&color, 1, 4, fp2);
			}
			if (stageAtSync[i + j]) fEmpty = 0;
		}
		fprintf(fp, "\n");
		if (fEmpty) nEmptySync++;
		else nEmptySync = 0;
		//if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;
	}

	fclose(fp);
	fclose(fp2);
	free(stageAtSync);
#endif

	return(0);
}

int AliHLTTPCCAGPUTracker::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
{
	//Initialize Slice Tracker Parameter for a slave tracker
	fSlaveTrackers[iSlice].Initialize(param);
	if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
	{
		HLTError("Error: Slice Tracker %d Row Count of %d does not match the HLTCA_ROW_COUNT constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
		return(1);
	}
	return(0);
}

int AliHLTTPCCAGPUTracker::ExitGPU()
{
	//Uninitialize CUDA
	cudaThreadSynchronize();
	if (fGPUMemory)
	{
		cudaFree(fGPUMemory);
		fGPUMemory = NULL;
	}
	if (fHostLockedMemory)
	{
		for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
		{
			cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
		}
		free(fpCudaStreams);
		fGpuTracker = NULL;
		cudaFreeHost(fHostLockedMemory);
	}

	if (CudaFailedMsg(cudaThreadExit()))
	{
		HLTError("Could not uninitialize GPU");
		return(1);
	}
	HLTInfo("CUDA Uninitialized");
	fgGPUUsed = false;
	return(0);
}

void AliHLTTPCCAGPUTracker::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
{
	fOutputControl = val;
	for (int i = 0;i < fgkNSlices;i++)
	{
		fSlaveTrackers[i].SetOutputControl(val);
	}
}

#endif