From b22af1bf862f20470b035855e5861933a18996d4 Mon Sep 17 00:00:00 2001 From: sgorbuno Date: Thu, 1 Oct 2009 13:35:10 +0000 Subject: [PATCH] Update of the GPU tracker from David Rohr --- .../tracking-ca/AliHLTTPCCAClusterData.cxx | 5 +- .../tracking-ca/AliHLTTPCCAClusterData.h | 10 +- HLT/TPCLib/tracking-ca/AliHLTTPCCADef.h | 45 +- HLT/TPCLib/tracking-ca/AliHLTTPCCADisplay.cxx | 4 +- HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUConfig.h | 73 ++ .../tracking-ca/AliHLTTPCCAGPUTracker.cu | 845 ++++++++++++---- .../AliHLTTPCCAGPUTracker.cu.patch | 11 + .../tracking-ca/AliHLTTPCCAGPUTracker.cxx | 37 + .../tracking-ca/AliHLTTPCCAGPUTracker.h | 69 +- HLT/TPCLib/tracking-ca/AliHLTTPCCAGrid.cxx | 3 +- HLT/TPCLib/tracking-ca/AliHLTTPCCAGrid.h | 5 +- HLT/TPCLib/tracking-ca/AliHLTTPCCAHit.h | 3 - HLT/TPCLib/tracking-ca/AliHLTTPCCAHitArea.cxx | 20 +- HLT/TPCLib/tracking-ca/AliHLTTPCCAHitArea.h | 3 - HLT/TPCLib/tracking-ca/AliHLTTPCCAHitId.h | 2 - HLT/TPCLib/tracking-ca/AliHLTTPCCAMerger.cxx | 4 +- .../AliHLTTPCCANeighboursCleaner.h | 3 - .../AliHLTTPCCANeighboursFinder.cxx | 99 +- .../tracking-ca/AliHLTTPCCANeighboursFinder.h | 13 +- HLT/TPCLib/tracking-ca/AliHLTTPCCAParam.h | 3 - .../tracking-ca/AliHLTTPCCAPerformance.cxx | 31 +- HLT/TPCLib/tracking-ca/AliHLTTPCCAProcess.h | 37 +- HLT/TPCLib/tracking-ca/AliHLTTPCCARow.h | 3 - .../tracking-ca/AliHLTTPCCASliceData.cxx | 119 ++- HLT/TPCLib/tracking-ca/AliHLTTPCCASliceData.h | 106 +- .../tracking-ca/AliHLTTPCCASliceOutput.cxx | 37 +- .../tracking-ca/AliHLTTPCCASliceOutput.h | 51 +- .../AliHLTTPCCAStandaloneFramework.cxx | 134 +-- .../AliHLTTPCCAStandaloneFramework.h | 55 +- .../AliHLTTPCCAStartHitsFinder.cxx | 27 +- .../tracking-ca/AliHLTTPCCAStartHitsFinder.h | 3 - .../AliHLTTPCCAStartHitsSorter.cxx | 111 +++ .../tracking-ca/AliHLTTPCCAStartHitsSorter.h | 50 + HLT/TPCLib/tracking-ca/AliHLTTPCCATrack.h | 3 - .../AliHLTTPCCATrackLinearisation.h | 3 - .../tracking-ca/AliHLTTPCCATrackParam.h | 7 +- HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.cxx | 473 +++++---- HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.h | 180 ++-- .../AliHLTTPCCATrackerComponent.cxx | 40 +- .../tracking-ca/AliHLTTPCCATrackerComponent.h | 6 +- .../AliHLTTPCCATrackerFramework.cxx | 119 +++ .../tracking-ca/AliHLTTPCCATrackerFramework.h | 66 ++ HLT/TPCLib/tracking-ca/AliHLTTPCCATracklet.h | 20 +- .../AliHLTTPCCATrackletConstructor.cxx | 934 ++++++++++++++---- .../AliHLTTPCCATrackletConstructor.h | 78 +- .../AliHLTTPCCATrackletSelector.cxx | 84 +- .../tracking-ca/AliHLTTPCCATrackletSelector.h | 14 +- HLT/TPCLib/tracking-ca/AliTPCtrackerCA.cxx | 18 +- .../tracking-ca/MemoryAssignmentHelpers.h | 2 +- 49 files changed, 3101 insertions(+), 967 deletions(-) create mode 100644 HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUConfig.h create mode 100644 HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cu.patch create mode 100644 HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cxx create mode 100644 HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsSorter.cxx create mode 100644 HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsSorter.h create mode 100644 HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.cxx create mode 100644 HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.h diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAClusterData.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCAClusterData.cxx index cf079554ece..321eccd9401 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAClusterData.cxx +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAClusterData.cxx @@ -18,6 +18,7 @@ #include "AliHLTTPCCAMath.h" #include #include "AliHLTArray.h" +#include "AliHLTTPCCAGPUConfig.h" void AliHLTTPCCAClusterData::StartReading( int sliceIndex, int guessForNumberOfClusters ) { @@ -27,8 +28,8 @@ void AliHLTTPCCAClusterData::StartReading( int sliceIndex, int guessForNumberOfC fFirstRow = 0; fLastRow = 0; fData.clear(); - fNumberOfClusters.reserve( 160 ); - fRowOffset.reserve( 160 ); + fNumberOfClusters.reserve( HLTCA_ROW_COUNT + 1 ); + fRowOffset.reserve( HLTCA_ROW_COUNT + 1 ); fData.reserve( CAMath::Max( 64, guessForNumberOfClusters ) ); } diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAClusterData.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCAClusterData.h index 758b12fedf4..df9c1f69b25 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAClusterData.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAClusterData.h @@ -17,6 +17,7 @@ #ifndef ALIHLTTPCCACLUSTERDATA_H #define ALIHLTTPCCACLUSTERDATA_H +#include "AliHLTTPCCADef.h" #include #include @@ -29,7 +30,7 @@ class AliHLTTPCCAClusterData { public: - AliHLTTPCCAClusterData(): fSliceIndex( 0 ), fFirstRow( 0 ), fLastRow( 0 ), fNumberOfClusters(), fRowOffset(), fData() {} + AliHLTTPCCAClusterData(): fSliceIndex( 0 ), fFirstRow( 0 ), fLastRow( -1 ), fNumberOfClusters(), fRowOffset(), fData() {} /** * prepare for the reading of event @@ -151,8 +152,11 @@ class AliHLTTPCCAClusterData */ void Merge( int index1, int index2 ); - - static bool CompareClusters( const Data &a, const Data &b ) { return ( a.fRow < b.fRow ); } +#ifdef REPRODUCIBLE_CLUSTER_SORTING + static bool CompareClusters( const Data &a, const Data &b ) { return ( a.fRow >= b.fRow ? (a.fId < b.fId) : (a.fRow < b.fRow) ); } +#else + static bool CompareClusters( const Data &a, const Data &b ) { return ( (a.fRow < b.fRow) ); } +#endif int fSliceIndex; // the slice index this data belongs to int fFirstRow; // see FirstRow() diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCADef.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCADef.h index 88fefc64c64..9903e792baa 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCADef.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCADef.h @@ -40,10 +40,18 @@ #endif #endif -#if defined(HLTCA_STANDALONE) || defined(HLTCA_GPUCODE) +#if defined(HLTCA_STANDALONE) // class TObject{}; +#ifdef ClassDef +#undef ClassDef +#endif + +#ifdef ClassTmp +#undef ClassTmp +#endif + #define ClassDef(name,id) #define ClassImp(name) @@ -96,16 +104,20 @@ namespace AliHLTTPCCADefinitions #endif +//#define EXTERN_ROW_HITS +#define TRACKLET_SELECTOR_MIN_HITS 10 +#define REPRODUCIBLE_CLUSTER_SORTING + #ifdef HLTCA_GPUCODE -#define ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP 5 -#define ALIHLTTPCCANEIGHBOURS_FINDER_MAX_FGRIDCONTENTUPDOWN 700 +#define ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP 6 +#define ALIHLTTPCCANEIGHBOURS_FINDER_MAX_FGRIDCONTENTUPDOWN 1000 #define ALIHLTTPCCASTARTHITSFINDER_MAX_FROWSTARTHITS 3500 -#define ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM 650 +#define ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM 1536 //Max amount of hits in a row that can be stored in shared memory, make sure this is divisible by ROW ALIGNMENT #else #define ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP 20 #define ALIHLTTPCCANEIGHBOURS_FINDER_MAX_FGRIDCONTENTUPDOWN 7000 #define ALIHLTTPCCASTARTHITSFINDER_MAX_FROWSTARTHITS 10000 -#define ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM 5000 +#define ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM 15000 #endif #ifdef HLTCA_GPUCODE @@ -118,8 +130,6 @@ namespace AliHLTTPCCADefinitions #define GPUshared() __shared__ #define GPUsync() __syncthreads() -__constant__ float4 gAliHLTTPCCATracker[30000/sizeof( float4 )]; - #else #define GPUd() @@ -130,9 +140,15 @@ __constant__ float4 gAliHLTTPCCATracker[30000/sizeof( float4 )]; #define GPUsync() struct float2 { float x; float y; }; -struct uchar2 { unsigned char x; unsigned char y; }; -struct ushort2 { unsigned short x; unsigned short y; }; +struct uchar2 { unsigned char x, y; }; +struct short2 { short x, y; }; +struct ushort2 { unsigned short x, y; }; +struct int2 { int x, y; }; +struct int3 { int x, y, z; }; +struct int4 { int x, y, z, w; }; struct uint1 { unsigned int x; }; +struct uint2 { unsigned int x, y; }; +struct uint3 { unsigned int x, y, z; }; struct uint4 { unsigned int x, y, z, w; }; #ifdef R__WIN32 @@ -149,6 +165,8 @@ inline bool finite(float x) /* * Helper for compile-time verification of correct API usage */ + +#ifndef HLTCA_GPUCODE namespace { template struct HLTTPCCA_STATIC_ASSERT_FAILURE; @@ -161,6 +179,9 @@ namespace typedef HLTTPCCA_STATIC_ASSERT_FAILURE HLTTPCCA_STATIC_ASSERT_CONCAT(_STATIC_ASSERTION_FAILED_##msg, __LINE__); \ HLTTPCCA_STATIC_ASSERT_CONCAT(_STATIC_ASSERTION_FAILED_##msg, __LINE__) Error_##msg; \ (void) Error_##msg +#else +#define STATIC_ASSERT(a, b) +#endif namespace { @@ -184,4 +205,10 @@ namespace void UNUSED_PARAM9( const T1 &, const T2 &, const T3 &, const T4 &, const T5 &, const T6 &, const T7 &, const T8 &, const T9 & ) {} } +#define UNROLL2(var, code) code;var++;code;var++; +#define UNROLL4(var, code) UNROLL2(var, code) UNROLL2(var, code) +#define UNROLL8(var, code) UNROLL4(var, code) UNROLL4(var, code) +#define UNROLL16(var, code) UNROLL8(var, code) UNROLL8(var, code) +#define UNROLL32(var, code) UNROLL16(var, code) UNROLL16(var, code) + #endif diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCADisplay.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCADisplay.cxx index 1fb152883c6..4c2fcb7b55f 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCADisplay.cxx +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCADisplay.cxx @@ -813,7 +813,7 @@ void AliHLTTPCCADisplay::DrawSliceOutTrack( AliHLTTPCCATrackParam &/*t*/, double AliHLTTPCCADisplayTmpHit vHits[200]; for ( int ih = 0; ih < track.NHits(); ih++ ) { - int id = tracker.FirstSliceHit()[fSlice->Param().ISlice()] + fSlice->OutTrackHits()[track.FirstHitRef()+ih]; + int id = tracker.FirstSliceHit()[fSlice->Param().ISlice()] + fSlice->OutTrackHit(track.FirstHitRef()+ih); const AliHLTTPCCAGBHit &h = tracker.Hits()[id]; vHits[ih].SetID( id ); vHits[ih].SetS( 0 ); @@ -835,7 +835,7 @@ void AliHLTTPCCADisplay::DrawSliceOutTrack( int /*itr*/, int /*color*/, int /*wi AliHLTTPCCADisplayTmpHit vHits[200]; for ( int ih = 0; ih < track.NHits(); ih++ ) { - int id = tracker.FirstSliceHit()[fSlice->Param().ISlice()] + fSlice->OutTrackHits()[track.FirstHitRef()+ih]; + int id = tracker.FirstSliceHit()[fSlice->Param().ISlice()] + fSlice->OutTrackHit(track.FirstHitRef()+ih); const AliHLTTPCCAGBHit &h = tracker.Hits()[id]; vHits[ih].SetID( id ); vHits[ih].SetS( 0 ); diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUConfig.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUConfig.h new file mode 100644 index 00000000000..668b42ff125 --- /dev/null +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUConfig.h @@ -0,0 +1,73 @@ +#ifndef ALIHLTTPCCAGPUCONFIG_H +#define ALIHLTTPCCAGPUCONFIG_H + +#define HLTCA_GPU_BLOCK_COUNT 30 +#define HLTCA_GPU_THREAD_COUNT 256 + +#define HLTCA_GPU_WARP_SIZE 32 +#define HLTCA_GPU_REGS 64 +#define HLTCA_ROW_COUNT 159 + +#define HLTCA_GPU_ROWALIGNMENT uint4 +#define HLTCA_GPU_ROWCOPY int +#define HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS 32 +//#define HLTCA_GPU_PREFETCHDATA +//#define HLTCA_GPU_PREFETCH_ROWBLOCK_ONLY + +#define HLTCA_GPU_SCHED_ROW_STEP 32 +#define HLTCA_GPU_SCHED_FIXED_START +//#define HLTCA_GPU_SCHED_FIXED_SLICE +#define HLTCA_GPU_RESCHED + +#define HLTCA_GPU_TEXTURE_FETCH + +//#define HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE + +#define HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE 12 +#define HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT 3 //Currently must be smaller than avaiable MultiProcessors on GPU or will result in wrong results + +#define HLTCA_GPU_SORT_DUMPDATA + +#define HLTCA_GPU_MAX_TRACKLETS 12288 //Should be divisible by 16 at least +#define HLTCA_GPU_MAX_TRACKS 3072 + +//#define HLTCA_GPU_EMULATION_SINGLE_TRACKLET 1313 +//#define HLTCA_GPU_EMULATION_DEBUG_TRACKLET 1313 + +#define HLTCA_GPU_TRACKER_CONSTANT_MEM 65000 + +#define HLTCA_GPU_TRACKER_OBJECT_MEMORY 1024 * 1024 +#define HLTCA_GPU_ROWS_MEMORY 1024 * 1024 +#define HLTCA_GPU_COMMON_MEMORY 1024 * 1024 +#define HLTCA_GPU_SLICE_DATA_MEMORY 7 * 1024 * 1024 +#define HLTCA_GPU_GLOBAL_MEMORY 20 * 1024 * 1024 +#define HLTCA_GPU_TRACKS_MEMORY 2 * 1024 * 1024 + +#ifndef HLTCA_GPUCODE +#ifdef HLTCA_GPU_TEXTURE_FETCH +#undef HLTCA_GPU_TEXTURE_FETCH +#endif + +#ifdef HLTCA_GPU_PREFETCHDATA +#undef HLTCA_GPU_PREFETCHDATA +#endif + +#undef HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE +#define HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE 0 + +#else +#define HLTCA_GPU_SORT_STARTHITS +#endif + +#if !defined(HLTCA_GPU_PREFETCHDATA) | !defined(HLTCA_GPU_RESCHED) +#undef HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS +#define HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS 0 +#endif + +#define HLTCA_GPU_ERROR_NONE 0 +#define HLTCA_GPU_ERROR_ROWBLOCK_TRACKLET_OVERFLOW 1 +#define HLTCA_GPU_ERROR_TRACKLET_OVERFLOW 2 +#define HLTCA_GPU_ERROR_TRACK_OVERFLOW 3 +#define HLTCA_GPU_ERROR_SCHEDULE_COLLISION 4 + +#endif \ No newline at end of file diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cu b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cu index b881a5e4f9b..e5530ceb099 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cu +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cu @@ -17,8 +17,9 @@ // * //*************************************************************************** -#include -#include +#include "AliHLTTPCCADef.h" +#include "AliHLTTPCCAGPUConfig.h" + #include #include @@ -32,7 +33,14 @@ #include "AliHLTTPCCAGPUTracker.h" -#ifdef BUILD_GPU +__constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )]; +#ifdef HLTCA_GPU_TEXTURE_FETCH +texture gAliTexRefu2; +texture gAliTexRefu; +texture gAliTexRefs; +#endif + +#include "AliHLTTPCCAHit.h" //Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code #include "AliHLTTPCCATrackParam.cxx" @@ -54,56 +62,182 @@ #include "AliHLTTPCCANeighboursCleaner.cxx" #include "AliHLTTPCCAStartHitsFinder.cxx" +#include "AliHLTTPCCAStartHitsSorter.cxx" #include "AliHLTTPCCATrackletConstructor.cxx" #include "AliHLTTPCCASliceOutput.cxx" +#include "MemoryAssignmentHelpers.h" + +#ifndef HLTCA_STANDALONE +#include "AliHLTDefinitions.h" +#include "AliHLTSystem.h" #endif -AliHLTTPCCAGPUTracker::AliHLTTPCCAGPUTracker() : gpuTracker(), DebugLevel(0) {} -AliHLTTPCCAGPUTracker::~AliHLTTPCCAGPUTracker() {} +ClassImp( AliHLTTPCCAGPUTracker ) -//Find best CUDA device, initialize and allocate memory -int AliHLTTPCCAGPUTracker::InitGPU() +int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID) { -#ifdef BUILD_GPU - int cudaDevice = cutGetMaxGflopsDeviceId(); - cudaSetDevice(cudaDevice); - - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop ,0 ); - std::cout<<"CUDA Device Properties: "<= 2) std::cout << "Available CUDA devices: "; + for (int i = 0;i < count;i++) + { + cudaGetDeviceProperties(&fCudaDeviceProp, i); + if (fDebugLevel >= 2) std::cout << fCudaDeviceProp.name << " (" << i << ") "; + if (fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && fCudaDeviceProp.multiProcessorCount * fCudaDeviceProp.clockRate > bestDeviceSpeed) + { + bestDevice = i; + bestDeviceSpeed = fCudaDeviceProp.multiProcessorCount * fCudaDeviceProp.clockRate; + } + } + if (fDebugLevel >= 2) std::cout << std::endl; + + if (bestDevice == -1) + { + HLTWarning("No CUDA Device available, aborting CUDA Initialisation"); + return(1); + } + + int cudaDevice; + if (forceDeviceID == -1) + cudaDevice = bestDevice; + else + cudaDevice = forceDeviceID; +#else + int cudaDevice = 0; +#endif + + cudaGetDeviceProperties(&fCudaDeviceProp ,cudaDevice ); + + if (fDebugLevel >= 1) { - std::cout << "CUDA Memory Allocation Error\n"; + std::cout<<"CUDA Device Properties: "< HLTCA_GPU_COMMON_MEMORY) + { + HLTError("Insufficiant Common Memory"); + cudaThreadExit(); + return(1); + } + + if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY) + { + HLTError("Insufficiant Row Memory"); + cudaThreadExit(); + return(1); + } + + fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY); + if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize))) + { + HLTError("CUDA Memory Allocation Error"); + cudaThreadExit(); + return(1); + } + if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize); + int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY; + if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize))) + { + cudaFree(fGPUMemory); + cudaThreadExit(); + HLTError("Error allocating Page Locked Host Memory"); + return(1); + } + if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize); + + if (fDebugLevel >= 1) + { + CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize)); + } + HLTInfo("CUDA Initialisation successfull"); + + //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory + if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY) + { + HLTError("Insufficiant Tracker Object Memory"); + return(1); + } + fSliceCount = sliceCount; + fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0); + + for (int i = 0;i < fgkNSlices;i++) + { + fSlaveTrackers[i].SetGPUTracker(); + fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i)); + fSlaveTrackers[i].pData()->SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i)); + } + + fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t)); + cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams; + for (int i = 0;i < CAMath::Max(3, fSliceCount);i++) + { + if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i]))) + { + HLTError("Error creating CUDA Stream"); + return(1); + } + } + +#if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION) + if (fDebugLevel < 2) + { + //Do one initial run for Benchmark reasons + const int useDebugLevel = fDebugLevel; + fDebugLevel = 0; + AliHLTTPCCAClusterData tmpCluster; + AliHLTTPCCASliceOutput tmpOutput; + AliHLTTPCCAParam tmpParam; + tmpParam.SetNRows(HLTCA_ROW_COUNT); + fSlaveTrackers[0].SetParam(tmpParam); + Reconstruct(&tmpOutput, &tmpCluster, 0, 1); + fDebugLevel = useDebugLevel; + } +#endif + return(0); } -//Macro to align Pointers. -//Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker -//(As long as every single data structure is <= 1 MB) template inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int alignment) { + //Macro to align Pointers. + //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker + //(As long as every single data structure is <= 1 MB) + size_t adr = (size_t) ptr; if (adr % alignment) { @@ -112,216 +246,561 @@ template inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int ali return((T*) adr); } -//Check for CUDA Error and in the case of an error display the corresponding error string -bool AliHLTTPCCAGPUTracker::CUDA_FAILED_MSG(cudaError_t error) +bool AliHLTTPCCAGPUTracker::CudaFailedMsg(cudaError_t error) { + //Check for CUDA Error and in the case of an error display the corresponding error string if (error == cudaSuccess) return(false); - printf("CUDA Error: %d / %s\n", error, cudaGetErrorString(error)); + HLTWarning("CUDA Error: %d / %s", error, cudaGetErrorString(error)); return(true); } -//Wait for CUDA-Kernel to finish and check for CUDA errors afterwards -int AliHLTTPCCAGPUTracker::CUDASync() +int AliHLTTPCCAGPUTracker::CUDASync(char* state) { - if (DebugLevel == 0) return(0); + //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards + + if (fDebugLevel == 0) return(0); cudaError cuErr; cuErr = cudaGetLastError(); if (cuErr != cudaSuccess) { - printf("Cuda Error %s while invoking kernel\n", cudaGetErrorString(cuErr)); + HLTError("Cuda Error %s while invoking kernel (%s)", cudaGetErrorString(cuErr), state); return(1); } - if (CUDA_FAILED_MSG(cudaThreadSynchronize())) + if (CudaFailedMsg(cudaThreadSynchronize())) { - printf("CUDA Error while synchronizing\n"); + HLTError("CUDA Error while synchronizing (%s)", state); return(1); } - if (DebugLevel >= 4) printf("CUDA Sync Done\n"); + if (fDebugLevel >= 5) HLTInfo("CUDA Sync Done"); return(0); } -void AliHLTTPCCAGPUTracker::SetDebugLevel(int dwLevel, std::ostream *NewOutFile) +void AliHLTTPCCAGPUTracker::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile) { - DebugLevel = dwLevel; - if (NewOutFile) OutFile = NewOutFile; + //Set Debug Level and Debug output File if applicable + fDebugLevel = dwLevel; + if (NewOutFile) fOutFile = NewOutFile; } -//Primary reconstruction function -int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCATracker* tracker) +int AliHLTTPCCAGPUTracker::SetGPUTrackerOption(char* OptionName, int OptionValue) { - int nThreads; - int nBlocks; - int size; - - if (tracker->CheckEmptySlice()) + //Set a specific GPU Tracker Option { - if (DebugLevel >= 4) printf("Slice Empty, not running GPU Tracker\n"); - return(0); + HLTError("Unknown Option: %s", OptionName); + return(1); } + //No Options used at the moment + //return(0); +} - if (DebugLevel >= 3) +#ifdef HLTCA_STANDALONE +void AliHLTTPCCAGPUTracker::StandalonePerfTime(int iSlice, int i) +{ + //Run Performance Query for timer i of slice iSlice + if (fDebugLevel >= 1) + { + AliHLTTPCCAStandaloneFramework::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i)); + } +} +#else +void AliHLTTPCCAGPUTracker::StandalonePerfTime(int /*iSlice*/, int /*i*/) {} +#endif + +void AliHLTTPCCAGPUTracker::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check) +{ + //Dump Rowblocks to File + if (fDebugLevel >= 4) { - *OutFile << endl << endl << "Slice: " << tracker->Param().ISlice() << endl; + *fOutFile << "RowBlock Tracklets" << std::endl; + + int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2); + int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2); + uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT); + CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost)); + CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost)); + CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT, cudaMemcpyDeviceToHost)); + CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost)); + + int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet; + for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++) + { + *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " << + rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << endl << "Phase 1: "; + for (int j = 0;j < rowBlockPos[i].x;j++) + { + //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem! + *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", "; + if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k) + { + HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k); + } + k++; + if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1) + { + HLTError("Error, -1 Tracklet found"); + } + } + *fOutFile << endl << "Phase 2: "; + for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++) + { + *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", "; + } + *fOutFile << endl; + } + + if (check) + { + *fOutFile << "Starting Threads: (First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl; + for (int i = 0;i < HLTCA_GPU_BLOCK_COUNT;i++) + { + *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl; + } + } + + free(rowBlockPos); + free(rowBlockTracklets); + free(blockStartingTracklet); } +} - if (DebugLevel >= 4) printf("\n\nInitialising GPU Tracker\n"); - memcpy(&gpuTracker, tracker, sizeof(AliHLTTPCCATracker)); - char* tmpMem = alignPointer((char*) GPUMemory, 1024 * 1024); - gpuTracker.SetGPUTracker(); +__global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits) +{ + //Initialize GPU RowBlocks and HitWeights + int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets; + int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights; + const int stride = blockDim.x * gridDim.x; + int4 i0, i1; + i0.x = i0.y = i0.z = i0.w = 0; + i1.x = i1.y = i1.z = i1.w = -1; + for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride) + RowBlockPos[i] = i0; + for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride) + rowBlockTracklets4[i] = i1; + for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride) + sliceDataHitWeights4[i] = i0; +} - if (DebugLevel >= 4) printf("Initialising GPU Common Memory\n"); - tmpMem = gpuTracker.SetGPUTrackerCommonMemory(tmpMem); - tmpMem = alignPointer(tmpMem, 1024 * 1024); +int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput* pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal) +{ + //Primary reconstruction function + cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams; - if (DebugLevel >= 4) printf("Initialising GPU Hits Memory\n"); - tmpMem = gpuTracker.SetGPUTrackerHitsMemory(tmpMem, tracker->NHitsTotal()); - tmpMem = alignPointer(tmpMem, 1024 * 1024); + if (sliceCountLocal == -1) sliceCountLocal = this->fSliceCount; - if (DebugLevel >= 4) printf("Initialising GPU Slice Data Memory\n"); - tmpMem = gpuTracker.fData.SetGPUSliceDataMemory(tmpMem, gpuTracker.fClusterData); - tmpMem = alignPointer(tmpMem, 1024 * 1024); - if (tmpMem - (char*) GPUMemory > GPUMemSize) + if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM) { - printf("Out of CUDA Memory\n"); + HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData)); return(1); } - - CUDA_FAILED_MSG(cudaMemcpy(gpuTracker.fCommonMemory, tracker->fCommonMemory, tracker->fCommonMemorySize, cudaMemcpyHostToDevice)); - CUDA_FAILED_MSG(cudaMemcpy(gpuTracker.fData.fMemory, tracker->fData.fMemory, tracker->fData.fMemorySize, cudaMemcpyHostToDevice)); - CUDA_FAILED_MSG(cudaMemcpyToSymbol(gAliHLTTPCCATracker, &gpuTracker, sizeof(AliHLTTPCCATracker))); - - if (DebugLevel >= 4) printf("Running GPU Neighbours Finder\n"); - AliHLTTPCCAProcess <<>>(); - if (CUDASync()) return 1; - if (DebugLevel >= 3) + if (fDebugLevel >= 4) { - *OutFile << "Neighbours Finder:" << endl; - CUDA_FAILED_MSG(cudaMemcpy(tracker->fData.fMemory, gpuTracker.fData.fMemory, tracker->fData.fMemorySize, cudaMemcpyDeviceToHost)); - tracker->DumpLinks(*OutFile); - } + for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) + { + *fOutFile << endl << endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << endl; + } + } - if (DebugLevel >= 4) printf("Running GPU Neighbours Cleaner\n"); - AliHLTTPCCAProcess <<>>(); - if (CUDASync()) return 1; + memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal); - if (DebugLevel >= 3) + if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice + sliceCountLocal].Param().ISlice()); + if (fDebugLevel >= 5) HLTInfo("Allocating GPU Tracker memory and initializing constants"); + + for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) { - *OutFile << "Neighbours Cleaner:" << endl; - CUDA_FAILED_MSG(cudaMemcpy(tracker->fData.fMemory, gpuTracker.fData.fMemory, tracker->fData.fMemorySize, cudaMemcpyDeviceToHost)); - tracker->DumpLinks(*OutFile); - } + //Make this a GPU Tracker + fGpuTracker[iSlice].SetGPUTracker(); + fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice)); + fGpuTracker[iSlice].pData()->SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice)); + fGpuTracker[iSlice].pData()->SetPointers(&pClusterData[iSlice], false); + + //Set Pointers to GPU Memory + char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice); + + if (fDebugLevel >= 5) HLTInfo("Initialising GPU Hits Memory"); + tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters()); + tmpMem = alignPointer(tmpMem, 1024 * 1024); + + if (fDebugLevel >= 5) HLTInfo("Initialising GPU Tracklet Memory"); + tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/); + tmpMem = alignPointer(tmpMem, 1024 * 1024); + + if (fDebugLevel >= 5) HLTInfo("Initialising GPU Track Memory"); + tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters()); + tmpMem = alignPointer(tmpMem, 1024 * 1024); + + if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY) + { + HLTError("Insufficiant Track Memory"); + return(1); + } + + if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY) + { + HLTError("Insufficiant Global Memory"); + return(1); + } + + //Initialize Startup Constants + *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0; + *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0; + *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0; + fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = HLTCA_GPU_BLOCK_COUNT * (iSlice + 1) / sliceCountLocal - HLTCA_GPU_BLOCK_COUNT * (iSlice) / sliceCountLocal; + if (fDebugLevel >= 5) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount); + fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice; + fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal; + fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0; + fGpuTracker[iSlice].pData()->SetGPUTextureBase(fGpuTracker[0].Data().Memory()); + } - if (DebugLevel >= 4) printf("Running GPU Start Hits Finder\n"); - AliHLTTPCCAProcess <<>>(); - if (CUDASync()) return 1; +#ifdef HLTCA_GPU_TEXTURE_FETCH + cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc(); + size_t offset; + if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset) + { + HLTError("Error binding CUDA Texture (Offset %d)", (int) offset); + return(1); + } + cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc(); + if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset) + { + HLTError("Error binding CUDA Texture (Offset %d)", (int) offset); + return(1); + } + cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc(); + if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset) + { + HLTError("Error binding CUDA Texture (Offset %d)", (int) offset); + return(1); + } +#endif - if (DebugLevel >= 4) printf("Obtaining Number of Start Hits from GPU: "); - CUDA_FAILED_MSG(cudaMemcpy(tracker->fCommonMemory, gpuTracker.fCommonMemory, tracker->fCommonMemorySize, cudaMemcpyDeviceToHost)); - if (DebugLevel >= 4) printf("%d\n", *tracker->NTracklets()); - else if (DebugLevel >= 2) printf("%3d ", *tracker->NTracklets()); + //Copy Tracker Object to GPU Memory + if (fDebugLevel >= 5) HLTInfo("Copying Tracker objects to GPU"); +#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE + if (CudaFailedMsg(cudaMalloc(&fGpuTracker[0].fStageAtSync, 100000000))) return(1); + CudaFailedMsg(cudaMemset(fGpuTracker[0].fStageAtSync, 0, 100000000)); +#endif + CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0])); - if (DebugLevel >= 3) + for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) { - *OutFile << "Start Hits: (" << *tracker->NTracklets() << ")" << endl; - CUDA_FAILED_MSG(cudaMemcpy(tracker->fHitMemory, gpuTracker.fHitMemory, tracker->fHitMemorySize, cudaMemcpyDeviceToHost)); - tracker->DumpStartHits(*OutFile); - } - - /*tracker->RunNeighboursFinder(); - tracker->RunNeighboursCleaner(); - tracker->RunStartHitsFinder();*/ - - if (DebugLevel >= 4) printf("Initialising GPU Track Memory\n"); - tmpMem = gpuTracker.SetGPUTrackerTracksMemory(tmpMem, *tracker->NTracklets(), tracker->NHitsTotal()); - tmpMem = alignPointer(tmpMem, 1024 * 1024); - if (tmpMem - (char*) GPUMemory > GPUMemSize) + StandalonePerfTime(firstSlice + iSlice, 0); + + //Initialize GPU Slave Tracker + if (fDebugLevel >= 5) HLTInfo("Creating Slice Data"); + fSlaveTrackers[firstSlice + iSlice].pData()->SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice)); + fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]); + if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY) + { + HLTError("Insufficiant Slice Data Memory"); + return(1); + } + + /*if (fSlaveTrackers[firstSlice + iSlice].CheckEmptySlice()) + { + if (fDebugLevel >= 5) HLTInfo("Slice Empty, not running GPU Tracker"); + if (sliceCountLocal == 1) + return(0); + }*/ + + //Initialize temporary memory where needed + if (fDebugLevel >= 5) HLTInfo("Copying Slice Data to GPU and initializing temporary memory"); + PreInitRowBlocks<<<30, 256, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign()); + + //Copy Data to GPU Global Memory + CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1])); + CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1])); + CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1])); + + if (fDebugLevel >= 4) + { + if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory"); + fSlaveTrackers[firstSlice + iSlice].TrackletMemory() = reinterpret_cast ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ); + fSlaveTrackers[firstSlice + iSlice].SetPointersTracklets( HLTCA_GPU_MAX_TRACKLETS ); + fSlaveTrackers[firstSlice + iSlice].HitMemory() = reinterpret_cast ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100] ); + fSlaveTrackers[firstSlice + iSlice].SetPointersHits( pClusterData[iSlice].NumberOfClusters() ); + } + + if (CUDASync("Initialization")) return(1); + StandalonePerfTime(firstSlice + iSlice, 1); + + if (fDebugLevel >= 5) HLTInfo("Running GPU Neighbours Finder"); + AliHLTTPCCAProcess <<>>(iSlice); + + if (CUDASync("Neighbours finder")) return 1; + + StandalonePerfTime(firstSlice + iSlice, 2); + + if (fDebugLevel >= 4) + { + CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost)); + fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile); + } + + if (fDebugLevel >= 5) HLTInfo("Running GPU Neighbours Cleaner"); + AliHLTTPCCAProcess <<>>(iSlice); + if (CUDASync("Neighbours Cleaner")) return 1; + + StandalonePerfTime(firstSlice + iSlice, 3); + + if (fDebugLevel >= 4) + { + CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost)); + fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile); + } + + if (fDebugLevel >= 5) HLTInfo("Running GPU Start Hits Finder"); + AliHLTTPCCAProcess <<>>(iSlice); + if (CUDASync("Start Hits Finder")) return 1; + + StandalonePerfTime(firstSlice + iSlice, 4); + + if (fDebugLevel >= 5) HLTInfo("Running GPU Start Hits Sorter"); + AliHLTTPCCAProcess <<<30, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice); + if (CUDASync("Start Hits Sorter")) return 1; + + StandalonePerfTime(firstSlice + iSlice, 5); + + if (fDebugLevel >= 2) + { + CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost)); + if (fDebugLevel >= 5) HLTInfo("Obtaining Number of Start Hits from GPU: %d", *fSlaveTrackers[firstSlice + iSlice].NTracklets()); + if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS) + { + HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant"); + return(1); + } + } + + if (fDebugLevel >= 4) + { + *fOutFile << "Temporary "; + CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost)); + fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile); + uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows()); + CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost)); + *fOutFile << "Start Hits Sort Vector:" << std::endl; + for (int i = 0;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows();i++) + { + *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl; + } + free(tmpMemory); + + CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost)); + fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile); + } + + StandalonePerfTime(firstSlice + iSlice, 6); + + fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters()); + } + + StandalonePerfTime(firstSlice, 7); +#ifdef HLTCA_GPU_PREFETCHDATA + for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) { - printf("Out of CUDA Memory\n"); - return(1); + if (fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v) > ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4)) + { + HLTError("Insufficiant GPU shared Memory, required: %d, available %d", fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v), ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4)); + return(1); + } + if (fDebugLevel >= 1) + { + static int infoShown = 0; + if (!infoShown) + { + HLTInfo("GPU Shared Memory Cache Size: %d", 2 * fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v)); + infoShown = 1; + } + } } +#endif - tracker->fData.ClearHitWeights(); - CUDA_FAILED_MSG(cudaMemcpy(gpuTracker.fData.fHitWeights, tracker->fData.fHitWeights, tracker->fData.fNumberOfHits * sizeof(int), cudaMemcpyHostToDevice)); - CUDA_FAILED_MSG(cudaMemcpyToSymbol(gAliHLTTPCCATracker, &gpuTracker, sizeof(AliHLTTPCCATracker))); + if (fDebugLevel >= 5) HLTInfo("Initialising Tracklet Constructor Scheduler"); + for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) + { + AliHLTTPCCATrackletConstructorInit<<>>(iSlice); + if (CUDASync("Tracklet Initializer")) return 1; + DumpRowBlocks(fSlaveTrackers, iSlice); + } - if (DebugLevel >= 4) printf("Initialising Slice Tracker (CPU) Track Memory\n"); - tracker->fTrackMemory = reinterpret_cast ( new uint4 [ gpuTracker.fTrackMemorySize/sizeof( uint4 ) + 100] ); - tracker->SetPointersTracks( *tracker->NTracklets(), tracker->NHitsTotal() ); + if (fDebugLevel >= 5) HLTInfo("Running GPU Tracklet Constructor"); + AliHLTTPCCATrackletConstructorNewGPU<<>>(); + if (CUDASync("Tracklet Constructor (new)")) return 1; + + StandalonePerfTime(firstSlice, 8); -/* tracker->RunTrackletConstructor(); - if (DebugLevel >= 3) + if (fDebugLevel >= 4) { - *OutFile << "Tracklet Hits:" << endl; - tracker->DumpTrackletHits(*OutFile); - }*/ - - int nMemThreads = TRACKLET_CONSTRUCTOR_NMEMTHREDS; - nThreads = 256;//96; - nBlocks = *tracker->NTracklets()/nThreads + 1; - if( nBlocks<30 ){ - nBlocks = 30; - nThreads = (*tracker->NTracklets())/nBlocks+1; - if( nThreads%32 ) nThreads = (nThreads/32+1)*32; + for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) + { + DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false); + CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost)); + if (fDebugLevel >= 5) + { + HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets()); + } + CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost)); + CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost)); + fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile); + } } - if (DebugLevel >= 4) printf("Running GPU Tracklet Constructor\n"); - //AliHLTTPCCAProcess1 <<>>(); - AliHLTTPCCAProcess1 <<<1, TRACKLET_CONSTRUCTOR_NMEMTHREDS + *tracker->fNTracklets>>>(); - if (CUDASync()) return 1; + for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) + { + if (fDebugLevel >= 5) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT); + AliHLTTPCCAProcessMulti<<>>(iSlice, CAMath::Min(HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT, sliceCountLocal - iSlice)); + } + if (CUDASync("Tracklet Selector")) return 1; + StandalonePerfTime(firstSlice, 9); - if (DebugLevel >= 3) + CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + 0].CommonMemory(), fGpuTracker[0].CommonMemory(), fGpuTracker[0].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[0])); + for (int iSliceTmp = 0;iSliceTmp <= sliceCountLocal;iSliceTmp++) { - *OutFile << "Tracklet Hits:" << endl; - CUDA_FAILED_MSG(cudaMemcpy(tracker->fNTracklets, gpuTracker.fNTracklets, sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_FAILED_MSG(cudaMemcpy(tracker->fTracklets, gpuTracker.fTracklets, gpuTracker.fTrackMemorySize, cudaMemcpyDeviceToHost)); - tracker->DumpTrackletHits(*OutFile); - } + if (iSliceTmp < sliceCountLocal) + { + int iSlice = iSliceTmp; + if (fDebugLevel >= 5) HLTInfo("Transfering Tracks from GPU to Host"); + cudaStreamSynchronize(cudaStreams[iSlice]); + CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[iSlice])); + CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[iSlice])); + if (iSlice + 1 < sliceCountLocal) + CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[iSlice + 1])); + } + + if (iSliceTmp) + { + int iSlice = iSliceTmp - 1; + cudaStreamSynchronize(cudaStreams[iSlice]); + + if (fDebugLevel >= 4) + { + CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost)); + fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile); + fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile); + } + + if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError) + { + HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError); + return(1); + } + if (fDebugLevel >= 5) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits()); + + fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]); + fSlaveTrackers[firstSlice + iSlice].WriteOutput(); + + if (fDebugLevel >= 4) + { + delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory(); + delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory(); + } + } + } + + StandalonePerfTime(firstSlice, 10); + + if (fDebugLevel >= 5) HLTInfo("GPU Reconstruction finished"); - //tracker->RunTrackletSelector(); +#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE + char* stageAtSync = (char*) malloc(100000000); + CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].fStageAtSync, 100 * 1000 * 1000, cudaMemcpyDeviceToHost)); + cudaFree(fGpuTracker[0].fStageAtSync); + + FILE* fp = fopen("profile.txt", "w+"); + FILE* fp2 = fopen("profile.bmp", "w+b"); + int nEmptySync = 0, fEmpty; + + const int bmpheight = 1000; + BITMAPFILEHEADER bmpFH; + BITMAPINFOHEADER bmpIH; + ZeroMemory(&bmpFH, sizeof(bmpFH)); + ZeroMemory(&bmpIH, sizeof(bmpIH)); + bmpFH.bfType = 19778; //"BM" + bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1) * bmpheight ; + bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH); - nThreads = 128; - nBlocks = *tracker->NTracklets()/nThreads + 1; - if( nBlocks<30 ){ - nBlocks = 30; - nThreads = *tracker->NTracklets()/nBlocks+1; - nThreads = (nThreads/32+1)*32; - } - if (DebugLevel >= 4) printf("Running GPU Tracklet Selector\n"); - AliHLTTPCCAProcess<<>>(); - //AliHLTTPCCAProcess<<<1, *tracker->fNTracklets>>>(); - if (CUDASync()) return 1; - - if (DebugLevel >= 4) printf("Transfering Tracks from GPU to Host "); - CUDA_FAILED_MSG(cudaMemcpy(tracker->NTracks(), gpuTracker.NTracks(), sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_FAILED_MSG(cudaMemcpy(tracker->NTrackHits(), gpuTracker.NTrackHits(), sizeof(int), cudaMemcpyDeviceToHost)); - if (DebugLevel >= 4) printf("%d / %d\n", *tracker->fNTracks, *tracker->fNTrackHits); - size = sizeof(AliHLTTPCCATrack) * *tracker->NTracks(); - CUDA_FAILED_MSG(cudaMemcpy(tracker->Tracks(), gpuTracker.Tracks(), size, cudaMemcpyDeviceToHost)); - size = sizeof(AliHLTTPCCAHitId) * *tracker->NTrackHits(); - if (CUDA_FAILED_MSG(cudaMemcpy(tracker->TrackHits(), gpuTracker.TrackHits(), size, cudaMemcpyDeviceToHost))) + bmpIH.biSize = sizeof(bmpIH); + bmpIH.biWidth = HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1; + bmpIH.biHeight = bmpheight; + bmpIH.biPlanes = 1; + bmpIH.biBitCount = 32; + + fwrite(&bmpFH, 1, sizeof(bmpFH), fp2); + fwrite(&bmpIH, 1, sizeof(bmpIH), fp2); + + for (int i = 0;i < bmpheight * HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;i += HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT) { - printf("CUDA Error during Reconstruction\n"); - return(1); + fEmpty = 1; + for (int j = 0;j < HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;j++) + { + fprintf(fp, "%d\t", stageAtSync[i + j]); + int color = 0; + if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0); + if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0); + if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255); + if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0); + fwrite(&color, 1, sizeof(int), fp2); + if (j > 0 && j % 32 == 0) + { + color = RGB(255, 255, 255); + fwrite(&color, 1, 4, fp2); + } + if (stageAtSync[i + j]) fEmpty = 0; + } + fprintf(fp, "\n"); + if (fEmpty) nEmptySync++; + else nEmptySync = 0; + //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break; } - if (DebugLevel >= 3) - { - *OutFile << "Track Hits: (" << *tracker->NTracks() << ")" << endl; - tracker->DumpTrackHits(*OutFile); - } + fclose(fp); + fclose(fp2); + free(stageAtSync); +#endif - if (DebugLevel >= 4) printf("Running WriteOutput\n"); - tracker->WriteOutput(); + return(0); +} - if (DebugLevel >= 4) printf("GPU Reconstruction finished\n"); - +int AliHLTTPCCAGPUTracker::InitializeSliceParam(int iSlice, AliHLTTPCCAParam ¶m) +{ + //Initialize Slice Tracker Parameter for a slave tracker + fSlaveTrackers[iSlice].Initialize(param); + if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT) + { + HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT); + return(1); + } return(0); } int AliHLTTPCCAGPUTracker::ExitGPU() { - cudaFree(GPUMemory); + //Uninitialize CUDA + cudaThreadSynchronize(); + if (fGPUMemory) + { + cudaFree(fGPUMemory); + fGPUMemory = NULL; + } + if (fHostLockedMemory) + { + for (int i = 0;i < CAMath::Max(3, fSliceCount);i++) + { + cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]); + } + free(fpCudaStreams); + fGpuTracker = NULL; + cudaFreeHost(fHostLockedMemory); + } + + if (CudaFailedMsg(cudaThreadExit())) + { + HLTError("Could not uninitialize GPU"); + return(1); + } + HLTInfo("CUDA Uninitialized"); return(0); } diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cu.patch b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cu.patch new file mode 100644 index 00000000000..de949511129 --- /dev/null +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cu.patch @@ -0,0 +1,11 @@ +--- AliHLTTPCCAGPUTracker.cucpp 2009-05-28 12:14:09.000000000 +0200 ++++ release/x86_64-pc-linux-gnu/code/AliHLTTPCCAGPUTracker.cucpp 2009-05-28 12:10:25.000000000 +0200 +@@ -23186,7 +23186,7 @@ + static T2 *Alloc(int s) { auto T2 *p = (reinterpret_cast< T2 *>(_mm_malloc(s * sizeof(CacheLineSizeHelper< T> ), 128))); return new (p) T2 [s]; } + static void Free(T2 *const p, int size) { + for (int i = 0; i < size; ++i) { +-((p[i]).~CacheLineSizeHelper()); ++((p[i]).~T2()); + } + _mm_free(p); + } diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cxx new file mode 100644 index 00000000000..5c8f67aef8d --- /dev/null +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cxx @@ -0,0 +1,37 @@ +// ************************************************************************** +// This file is property of and copyright by the ALICE HLT Project * +// ALICE Experiment at CERN, All rights reserved. * +// * +// Primary Authors: Sergey Gorbunov * +// Ivan Kisel * +// David Rohr * +// for The ALICE HLT Project. * +// * +// Permission to use, copy, modify and distribute this software and its * +// documentation strictly for non-commercial purposes is hereby granted * +// without fee, provided that the above copyright notice appears in all * +// copies and that both the copyright notice and this permission notice * +// appear in the supporting documentation. The authors make no claims * +// about the suitability of this software for any purpose. It is * +// provided "as is" without express or implied warranty. * +// * +//*************************************************************************** + +//If not building GPU Code then build dummy functions to link against +#include "AliHLTTPCCAGPUTracker.h" + +int AliHLTTPCCAGPUTracker::InitGPU(int /*sliceCount*/, int /*forceDeviceID*/) +{ + //Dummy init function if CUDA is not available + HLTInfo("CUDA Compiler was not available during build process, omitting CUDA initialization"); + return(1); +} +void StandalonePerfTime(int /*iSlice*/, int /*i*/) {} +//template inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int alignment) {return(NULL);} +//bool AliHLTTPCCAGPUTracker::CudaFailedMsg(cudaError_t error) {return(true);} +//int AliHLTTPCCAGPUTracker::CUDASync() {return(1);} +void AliHLTTPCCAGPUTracker::SetDebugLevel(int /*dwLevel*/, std::ostream* /*NewOutFile*/) {} +int AliHLTTPCCAGPUTracker::SetGPUTrackerOption(char* /*OptionName*/, int /*OptionValue*/) {return(1);} +int Reconstruct(AliHLTTPCCASliceOutput* /*pTracker*/, AliHLTTPCCAClusterData* /*pClusterData*/, int /*fFirstSlice*/, int /*fSliceCount*/) {return(1);} +int AliHLTTPCCAGPUTracker::ExitGPU() {return(0);} +int AliHLTTPCCAGPUTracker::InitializeSliceParam(int /*iSlice*/, AliHLTTPCCAParam& /*param*/) {} diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.h index b44819f8b39..fd432e5461c 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.h @@ -5,36 +5,79 @@ // * //************************************************************************* +#ifndef ALIHLTTPCCAGPUTRACKER_H +#define ALIHLTTPCCAGPUTRACKER_H + #include "AliHLTTPCCADef.h" #include "AliHLTTPCCATracker.h" +#include "AliHLTLogging.h" + +class AliHLTTPCCARow; -class AliHLTTPCCAGPUTracker +class AliHLTTPCCAGPUTracker : AliHLTLogging { public: - AliHLTTPCCAGPUTracker(); - ~AliHLTTPCCAGPUTracker(); + AliHLTTPCCAGPUTracker() : + fGpuTracker(NULL), + fGPUMemory(NULL), + fHostLockedMemory(NULL), + fDebugLevel(0), + fOutFile(NULL), + fGPUMemSize(0), + fpCudaStreams(NULL), + fSliceCount(0) + {}; + ~AliHLTTPCCAGPUTracker() {}; - int InitGPU(); - int Reconstruct(AliHLTTPCCATracker* tracker); + int InitGPU(int sliceCount = 1, int forceDeviceID = -1); + int Reconstruct(AliHLTTPCCASliceOutput* pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1); int ExitGPU(); - void SetDebugLevel(int dwLevel, std::ostream *NewOutFile = NULL); + void SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile = NULL); + int SetGPUTrackerOption(char* OptionName, int OptionValue); + + unsigned long long int* PerfTimer(int iSlice, unsigned int i) {return(fSlaveTrackers ? fSlaveTrackers[iSlice].PerfTimer(i) : NULL); } + + int InitializeSliceParam(int iSlice, AliHLTTPCCAParam ¶m); private: - AliHLTTPCCATracker gpuTracker; - void* GPUMemory; + static void* RowMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + iSlice * sizeof(AliHLTTPCCARow) * (HLTCA_ROW_COUNT + 1) ); } + static void* CommonMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + iSlice * AliHLTTPCCATracker::CommonMemorySize() ); } + static void* SliceDataMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + iSlice * HLTCA_GPU_SLICE_DATA_MEMORY ); } + void* GlobalMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY) + iSlice * HLTCA_GPU_GLOBAL_MEMORY ); } + void* TracksMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY) + iSlice * HLTCA_GPU_TRACKS_MEMORY ); } + void* TrackerMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + iSlice * sizeof(AliHLTTPCCATracker) ); } + + void DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check = true); + + AliHLTTPCCATracker *fGpuTracker; + void* fGPUMemory; + void* fHostLockedMemory; - int CUDASync(); + int CUDASync(char* state = "UNKNOWN"); template T* alignPointer(T* ptr, int alignment); - int DebugLevel; - std::ostream *OutFile; - int GPUMemSize; + void StandalonePerfTime(int iSlice, int i); + + int fDebugLevel; //Debug Level for GPU Tracker + std::ostream* fOutFile; //Debug Output Stream Pointer + unsigned long long int fGPUMemSize; //Memory Size to allocate on GPU + + void* fpCudaStreams; + + int fSliceCount; + + static const int fgkNSlices = 36; + AliHLTTPCCATracker fSlaveTrackers[fgkNSlices]; #ifdef HLTCA_GPUCODE - bool CUDA_FAILED_MSG(cudaError_t error); + bool CudaFailedMsg(cudaError_t error); #endif + // disable copy AliHLTTPCCAGPUTracker( const AliHLTTPCCAGPUTracker& ); AliHLTTPCCAGPUTracker &operator=( const AliHLTTPCCAGPUTracker& ); + ClassDef( AliHLTTPCCAGPUTracker, 0 ) }; + +#endif \ No newline at end of file diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGrid.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGrid.cxx index eacbaab4c91..501f10aab35 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGrid.cxx +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGrid.cxx @@ -30,6 +30,7 @@ GPUd() void AliHLTTPCCAGrid::CreateEmpty() { + //Create an empty grid fYMin = 0.f; fYMax = 1.f; fZMin = 0.f; @@ -87,7 +88,7 @@ int AliHLTTPCCAGrid::GetBinBounded( float Y, float Z ) const return bin; } -GPUd() void AliHLTTPCCAGrid::GetBin( float Y, float Z, int *bY, int *bZ ) const +GPUd() void AliHLTTPCCAGrid::GetBin( float Y, float Z, int* const bY, int* const bZ ) const { //* get the bin pointer diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGrid.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGrid.h index 7a8af5d921f..e08bf7d8498 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGrid.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGrid.h @@ -31,7 +31,7 @@ class AliHLTTPCCAGrid * returns -1 if the row is empty == no hits */ GPUd() int GetBinBounded( float Y, float Z ) const; - GPUd() void GetBin( float Y, float Z, int *bY, int *bZ ) const; + GPUd() void GetBin( float Y, float Z, int* const bY, int* const bZ ) const; GPUd() unsigned int N() const { return fN; } GPUd() unsigned int Ny() const { return fNy; } @@ -43,10 +43,7 @@ class AliHLTTPCCAGrid GPUd() float StepYInv() const { return fStepYInv; } GPUd() float StepZInv() const { return fStepZInv; } -#ifndef CUDA_DEVICE_EMULATION private: -#endif - unsigned int fNy; //* N bins in Y unsigned int fNz; //* N bins in Z unsigned int fN; //* total N bins diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAHit.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCAHit.h index f2279c678f7..f010439c884 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAHit.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAHit.h @@ -29,10 +29,7 @@ class AliHLTTPCCAHit GPUhd() void SetY( float v ) { fY = v; } GPUhd() void SetZ( float v ) { fZ = v; } -#ifndef CUDA_DEVICE_EMULATION protected: -#endif - float fY, fZ; // Y and Z position of the TPC cluster }; diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAHitArea.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCAHitArea.cxx index a37d3585cbb..549e232d27d 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAHitArea.cxx +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAHitArea.cxx @@ -20,7 +20,7 @@ #include "AliHLTTPCCATracker.h" #include "AliHLTTPCCAGrid.h" #include "AliHLTTPCCAHit.h" -#include "AliHLTTPCCARow.h" +class AliHLTTPCCARow; GPUd() void AliHLTTPCCAHitArea::Init( const AliHLTTPCCARow &row, const AliHLTTPCCASliceData &slice, float y, float z, @@ -45,8 +45,13 @@ GPUd() void AliHLTTPCCAHitArea::Init( const AliHLTTPCCARow &row, const AliHLTTPC fIz = bZmin; // for given fIz (which is min atm.) get +#ifdef HLTCA_GPU_TEXTURE_FETCHa + fHitYfst = tex1Dfetch(gAliTexRefu, ((char*) slice.FirstHitInBin(row) - slice.GPUTextureBaseConst()) / sizeof(unsigned short) + fIndYmin); + fHitYlst = tex1Dfetch(gAliTexRefu, ((char*) slice.FirstHitInBin(row) - slice.GPUTextureBaseConst()) / sizeof(unsigned short) + fIndYmin + fBDY); +#else fHitYfst = slice.FirstHitInBin( row, fIndYmin ); // first and fHitYlst = slice.FirstHitInBin( row, fIndYmin + fBDY ); // last hit index in the bin +#endif fIh = fHitYfst; } @@ -72,13 +77,24 @@ GPUd() int AliHLTTPCCAHitArea::GetNext( const AliHLTTPCCATracker &tracker, const // go to next z and start y from the min again ++fIz; fIndYmin += fNy; +#ifdef HLTCA_GPU_TEXTURE_FETCHa + fHitYfst = tex1Dfetch(gAliTexRefu, ((char*) slice.FirstHitInBin(row) - slice.GPUTextureBaseConst()) / sizeof(unsigned short) + fIndYmin); + fHitYlst = tex1Dfetch(gAliTexRefu, ((char*) slice.FirstHitInBin(row) - slice.GPUTextureBaseConst()) / sizeof(unsigned short) + fIndYmin + fBDY); +#else fHitYfst = slice.FirstHitInBin( row, fIndYmin ); fHitYlst = slice.FirstHitInBin( row, fIndYmin + fBDY ); +#endif fIh = fHitYfst; } - h->SetY( y0 + tracker.HitDataY( row, fIh ) * stepY ); +#ifdef HLTCA_GPU_TEXTURE_FETCHa + ushort2 tmpval = tex1Dfetch(gAliTexRefu2, ((char*) slice.HitData(row) - slice.GPUTextureBaseConst()) / sizeof(ushort2) + fIh);; + h->SetY( y0 + tmpval.x * stepY ); + h->SetZ( z0 + tmpval.y * stepZ ); +#else + h->SetY( y0 + tracker.HitDataY( row, fIh ) * stepY ); h->SetZ( z0 + tracker.HitDataZ( row, fIh ) * stepZ ); +#endif if ( 1 && ( h->Z() > fMaxZ || h->Z() < fMinZ || h->Y() < fMinY || h->Y() > fMaxY ) ) { //SG!!! fIh++; diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAHitArea.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCAHitArea.h index f1b82147b32..4084a2f6538 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAHitArea.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAHitArea.h @@ -59,10 +59,7 @@ class AliHLTTPCCAHitArea int Ny() const { return fNy; } int HitOffset() const { return fHitOffset; } -#ifndef CUDA_DEVICE_EMULATION protected: -#endif - float fY; // search coordinates float fZ; // search coordinates float fMinZ; // search coordinates diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAHitId.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCAHitId.h index 85012836129..1d0cc2eb61e 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAHitId.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAHitId.h @@ -24,9 +24,7 @@ class AliHLTTPCCAHitId GPUhd() int RowIndex() const { return fId & 0xff; } GPUhd() int HitIndex() const { return fId >> 8; } -#ifndef CUDA_DEVICE_EMULATION private: -#endif int fId; }; diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAMerger.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCAMerger.cxx index 40907d72b17..9fa589d7c93 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAMerger.cxx +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAMerger.cxx @@ -847,9 +847,9 @@ void AliHLTTPCCAMerger::Merging() } fOutput->SetNTracks( nOutTracks ); -#ifdef HLTCA_STANDALONE + #ifdef HLTCA_STANDALONE printf("Tracks Output: %d\n", nOutTracks); -#endif + #endif fOutput->SetNTrackClusters( nOutTrackClusters ); fOutput->SetPointers(); diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCANeighboursCleaner.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCANeighboursCleaner.h index 4987d8eb5d7..ad5c1bc9a7e 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCANeighboursCleaner.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCANeighboursCleaner.h @@ -33,10 +33,7 @@ class AliHLTTPCCANeighboursCleaner AliHLTTPCCASharedMemory& operator=( const AliHLTTPCCASharedMemory& /*dummy*/ ) { return *this; } #endif -#ifndef CUDA_DEVICE_EMULATION protected: -#endif - int fIRow; // current row index int fIRowUp; // current row index int fIRowDn; // current row index diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCANeighboursFinder.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCANeighboursFinder.cxx index 93fb8eee292..715754f5bb4 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCANeighboursFinder.cxx +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCANeighboursFinder.cxx @@ -35,11 +35,27 @@ GPUd() void AliHLTTPCCANeighboursFinder::Thread //* find neighbours if ( iSync == 0 ) { +#ifdef HLTCA_GPUCODE + for (int i = iThread;i < sizeof(AliHLTTPCCARow) / sizeof(int);i += nThreads) + { + reinterpret_cast(&s.fRow)[i] = reinterpret_cast(&tracker.SliceDataRows()[iBlock])[i]; + if (iBlock >= 2 && iBlock <= tracker.Param().NRows() - 3) + { + reinterpret_cast(&s.fRowUp)[i] = reinterpret_cast(&tracker.SliceDataRows()[iBlock + 2])[i]; + reinterpret_cast(&s.fRowDown)[i] = reinterpret_cast(&tracker.SliceDataRows()[iBlock - 2])[i]; + } + } + __syncthreads(); +#endif if ( iThread == 0 ) { s.fNRows = tracker.Param().NRows(); s.fIRow = iBlock; if ( s.fIRow < s.fNRows ) { - const AliHLTTPCCARow &row = tracker.Row( s.fIRow ); +#ifdef HLTCA_GPUCODE + const AliHLTTPCCARow &row = s.fRow; +#else + const AliHLTTPCCARow &row = tracker.Row( s.fIRow ); +#endif s.fNHits = row.NHits(); if ( ( s.fIRow >= 2 ) && ( s.fIRow <= s.fNRows - 3 ) ) { @@ -47,9 +63,14 @@ GPUd() void AliHLTTPCCANeighboursFinder::Thread s.fIRowDn = s.fIRow - 2; // references to the rows above and below + +#ifdef HLTCA_GPUCODE + const AliHLTTPCCARow &rowUp = s.fRowUp; + const AliHLTTPCCARow &rowDn = s.fRowDown; +#else const AliHLTTPCCARow &rowUp = tracker.Row( s.fIRowUp ); const AliHLTTPCCARow &rowDn = tracker.Row( s.fIRowDn ); - +#endif // the axis perpendicular to the rows const float xDn = rowDn.X(); const float x = row.X(); @@ -67,29 +88,38 @@ GPUd() void AliHLTTPCCANeighboursFinder::Thread // UpTx/DnTx is used to move the HitArea such that central events are preferred (i.e. vertices // coming from y = 0, z = 0). - s.fGridUp = tracker.Row( s.fIRowUp ).Grid(); - s.fGridDn = tracker.Row( s.fIRowDn ).Grid(); + //s.fGridUp = tracker.Row( s.fIRowUp ).Grid(); + //s.fGridDn = tracker.Row( s.fIRowDn ).Grid(); } } } } else if ( iSync == 1 ) { if ( s.fIRow < s.fNRows ) { - if ( ( s.fIRow == 0 ) || ( s.fIRow == s.fNRows - 1 ) || ( s.fIRow == 1 ) || ( s.fIRow == s.fNRows - 2 ) ) { - const AliHLTTPCCARow &row = tracker.Row( s.fIRow ); + if ( ( s.fIRow <= 1 ) || ( s.fIRow >= s.fNRows - 2 ) ) { +#ifdef HLTCA_GPUCODE + const AliHLTTPCCARow &row = s.fRow; +#else + const AliHLTTPCCARow &row = tracker.Row( s.fIRow ); +#endif for ( int ih = iThread; ih < s.fNHits; ih += nThreads ) { tracker.SetHitLinkUpData( row, ih, -1 ); tracker.SetHitLinkDownData( row, ih, -1 ); } } else { - const AliHLTTPCCARow &rowUp = tracker.Row( s.fIRowUp ); - const AliHLTTPCCARow &rowDn = tracker.Row( s.fIRowDn ); +/*#ifdef HLTCA_GPUCODE + const AliHLTTPCCARow &rowUp = s.fRowUp; + const AliHLTTPCCARow &rowDn = s.fRowDown; +#else + const AliHLTTPCCARow &rowUp = tracker.Row( s.fIRowUp ); + const AliHLTTPCCARow &rowDn = tracker.Row( s.fIRowDn ); +#endif for ( unsigned int ih = iThread; ih < s.fGridUp.N() + s.fGridUp.Ny() + 2; ih += nThreads ) { s.fGridContentUp[ih] = tracker.FirstHitInBin( rowUp, ih ); } for ( unsigned int ih = iThread; ih < s.fGridDn.N() + s.fGridDn.Ny() + 2; ih += nThreads ) { s.fGridContentDn[ih] = tracker.FirstHitInBin( rowDn, ih ); - } + }*/ } } } else if ( iSync == 2 ) { @@ -98,20 +128,30 @@ GPUd() void AliHLTTPCCANeighboursFinder::Thread float chi2Cut = 3.*3.*4 * ( s.fUpDx * s.fUpDx + s.fDnDx * s.fDnDx ); const float kAreaSize = tracker.Param().NeighboursSearchArea(); //float chi2Cut = 3.*3.*(s.fUpDx*s.fUpDx + s.fDnDx*s.fDnDx ); //SG - const int kMaxN = 20; - - const AliHLTTPCCARow &row = tracker.Row( s.fIRow ); - const AliHLTTPCCARow &rowUp = tracker.Row( s.fIRowUp ); - const AliHLTTPCCARow &rowDn = tracker.Row( s.fIRowDn ); +#define kMaxN 20 + +#ifdef HLTCA_GPUCODE + const AliHLTTPCCARow &row = s.fRow; + const AliHLTTPCCARow &rowUp = s.fRowUp; + const AliHLTTPCCARow &rowDn = s.fRowDown; +#else + const AliHLTTPCCARow &row = tracker.Row( s.fIRow ); + const AliHLTTPCCARow &rowUp = tracker.Row( s.fIRowUp ); + const AliHLTTPCCARow &rowDn = tracker.Row( s.fIRowDn ); +#endif const float y0 = row.Grid().YMin(); const float z0 = row.Grid().ZMin(); const float stepY = row.HstepY(); const float stepZ = row.HstepZ(); - for ( int ih = iThread; ih < s.fNHits; ih += nThreads ) { + for ( int ih = iThread; ih < s.fNHits; ih += nThreads ) { unsigned short *neighUp = s.fB[iThread]; float2 *yzUp = s.fA[iThread]; +#if defined(HLTCA_GPUCODE) & kMaxN > ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP + unsigned short neighUp2[kMaxN - ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP]; + float2 yzUp2[kMaxN - ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP]; +#endif //unsigned short neighUp[5]; //float2 yzUp[5]; @@ -123,8 +163,14 @@ GPUd() void AliHLTTPCCANeighboursFinder::Thread int nNeighUp = 0; // coordinates of the hit in the current row +#if defined(HLTCA_GPU_TEXTURE_FETCHa) + ushort2 tmpval = tex1Dfetch(gAliTexRefu2, ((char*) tracker.Data().HitData() - tracker.pData()->GPUTextureBase()) / sizeof(ushort2) + row.HitNumberOffset() + ih); + const float y = y0 + tmpval.x * stepY; + const float z = z0 + tmpval.y * stepZ; +#else const float y = y0 + tracker.HitDataY( row, ih ) * stepY; const float z = z0 + tracker.HitDataZ( row, ih ) * stepZ; +#endif AliHLTTPCCAHitArea areaDn, areaUp; // TODO: for NVIDIA GPUs it should use the GridContentUp/-Dn that got copied into shared mem @@ -135,8 +181,18 @@ GPUd() void AliHLTTPCCANeighboursFinder::Thread AliHLTTPCCAHit h; int i = areaUp.GetNext( tracker, rowUp, tracker.Data(), &h ); if ( i < 0 ) break; - neighUp[nNeighUp] = ( unsigned short ) i; - yzUp[nNeighUp] = CAMath::MakeFloat2( s.fDnDx * ( h.Y() - y ), s.fDnDx * ( h.Z() - z ) ); +#if defined(HLTCA_GPUCODE) & kMaxN > ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP + if (nNeighUp >= ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP) + { + neighUp2[nNeighUp - ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP] = ( unsigned short ) i; + yzUp2[nNeighUp - ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP] = CAMath::MakeFloat2( s.fDnDx * ( h.Y() - y ), s.fDnDx * ( h.Z() - z ) ); + } + else +#endif + { + neighUp[nNeighUp] = ( unsigned short ) i; + yzUp[nNeighUp] = CAMath::MakeFloat2( s.fDnDx * ( h.Y() - y ), s.fDnDx * ( h.Z() - z ) ); + } if ( ++nNeighUp >= kMaxN ) break; } while ( 1 ); @@ -156,7 +212,12 @@ GPUd() void AliHLTTPCCANeighboursFinder::Thread float2 yzdn = CAMath::MakeFloat2( s.fUpDx * ( h.Y() - y ), s.fUpDx * ( h.Z() - z ) ); for ( int iUp = 0; iUp < nNeighUp; iUp++ ) { +#if defined(HLTCA_GPUCODE) & kMaxN > ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP + float2 yzup = iUp >= ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP ? yzUp2[iUp - ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP] : yzUp[iUp]; +#else float2 yzup = yzUp[iUp]; +#endif + float dy = yzdn.x - yzup.x; float dz = yzdn.y - yzup.y; float d = dy * dy + dz * dz; @@ -169,7 +230,11 @@ GPUd() void AliHLTTPCCANeighboursFinder::Thread } while ( 1 ); if ( bestD <= chi2Cut ) { +#if defined(HLTCA_GPUCODE) & kMaxN > ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP + linkUp = bestUp >= ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP ? neighUp2[bestUp - ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP] : neighUp[bestUp]; +#else linkUp = neighUp[bestUp]; +#endif linkDn = bestDn; } } diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCANeighboursFinder.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCANeighboursFinder.h index 3dd21def0a8..ff6dbeee299 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCANeighboursFinder.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCANeighboursFinder.h @@ -11,7 +11,7 @@ #include "AliHLTTPCCADef.h" -#include "AliHLTTPCCAGrid.h" +#include "AliHLTTPCCARow.h" class AliHLTTPCCATracker; /** @@ -27,18 +27,14 @@ class AliHLTTPCCANeighboursFinder public: #if !defined(HLTCA_GPUCODE) AliHLTTPCCASharedMemory() - : fGridUp(), fGridDn(), fNHits( 0 ), fUpNHits( 0 ), fDnNHits( 0 ), fUpDx( 0 ), fDnDx( 0 ), fUpTx( 0 ), fDnTx( 0 ), fIRow( 0 ), fIRowUp( 0 ), fIRowDn( 0 ), fNRows( 0 ) {} + : fNHits( 0 ), fUpNHits( 0 ), fDnNHits( 0 ), fUpDx( 0 ), fDnDx( 0 ), fUpTx( 0 ), fDnTx( 0 ), fIRow( 0 ), fIRowUp( 0 ), fIRowDn( 0 ), fNRows( 0 ), fRow(), fRowUp(), fRowDown() {} AliHLTTPCCASharedMemory( const AliHLTTPCCASharedMemory& /*dummy*/ ) - : fGridUp(), fGridDn(), fNHits( 0 ), fUpNHits( 0 ), fDnNHits( 0 ), fUpDx( 0 ), fDnDx( 0 ), fUpTx( 0 ), fDnTx( 0 ), fIRow( 0 ), fIRowUp( 0 ), fIRowDn( 0 ), fNRows( 0 ) {} + : fNHits( 0 ), fUpNHits( 0 ), fDnNHits( 0 ), fUpDx( 0 ), fDnDx( 0 ), fUpTx( 0 ), fDnTx( 0 ), fIRow( 0 ), fIRowUp( 0 ), fIRowDn( 0 ), fNRows( 0 ), fRow(), fRowUp(), fRowDown() {} AliHLTTPCCASharedMemory& operator=( const AliHLTTPCCASharedMemory& /*dummy*/ ) { return *this; } #endif -#ifndef CUDA_DEVICE_EMULATION protected: -#endif - AliHLTTPCCAGrid fGridUp; // grid for the next row - AliHLTTPCCAGrid fGridDn; // grid for the previous row int fNHits; // n hits int fUpNHits; // n hits in the next row int fDnNHits; // n hits in the prev row @@ -52,8 +48,7 @@ class AliHLTTPCCANeighboursFinder int fNRows; // number of rows float2 fA[256][ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP]; // temp memory unsigned short fB[256][ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP]; // temp memory - unsigned short fGridContentUp[ALIHLTTPCCANEIGHBOURS_FINDER_MAX_FGRIDCONTENTUPDOWN]; // grid content for the next row - unsigned short fGridContentDn[ALIHLTTPCCANEIGHBOURS_FINDER_MAX_FGRIDCONTENTUPDOWN];// grid content for the previous row + AliHLTTPCCARow fRow, fRowUp, fRowDown; }; GPUd() static int NThreadSyncPoints() { return 2; } diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAParam.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCAParam.h index 0c9ec3da33c..ab58e61acc4 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAParam.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAParam.h @@ -125,10 +125,7 @@ class AliHLTTPCCAParam GPUd() float GetBz( float x, float y, float z ) const; GPUd() float GetBz( const AliHLTTPCCATrackParam &t ) const; -#ifndef CUDA_DEVICE_EMULATION protected: -#endif - int fISlice; // slice number int fNRows; // number of rows diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAPerformance.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCAPerformance.cxx index fa16795cd72..c8cbf4cc941 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAPerformance.cxx +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAPerformance.cxx @@ -365,7 +365,7 @@ void AliHLTTPCCAPerformance::ReadMCPoint( int TrackID, float X, float Y, float Z p.SetTime( Time ); p.SetISlice( iSlice ); float sx, sy, sz; - AliHLTTPCCAStandaloneFramework::Instance().SliceTracker( iSlice ).Param().Global2Slice( X, Y, Z, &sx, &sy, &sz ); + AliHLTTPCCAStandaloneFramework::Instance().Param( iSlice ).Global2Slice( X, Y, Z, &sx, &sy, &sz ); p.SetSx( sx ); p.SetSy( sy ); p.SetSz( sz ); @@ -1035,7 +1035,11 @@ void AliHLTTPCCAPerformance::SliceTrackCandPerformance( int /*iSlice*/, bool /*P int nla = 0; for ( int irow = firstRow; irow <= lastRow; irow++ ) { - int ih = t.RowHit( irow ); +#ifdef EXTERN_ROW_HITS + int ih = slice.TrackletRowHits[iRow * *slice.NTracklets() + itr]; +#else + int ih = t.RowHit( irow ); +#endif if ( ih < 0 ) continue; int index = firstSliceHit + slice.HitInputID( slice.Row( irow ), ih ); AliHLTTPCCAHitLabel &l = fHitLabels[fTracker->Hits()[index].ID()]; @@ -1065,7 +1069,11 @@ void AliHLTTPCCAPerformance::SliceTrackCandPerformance( int /*iSlice*/, bool /*P } lmax = 0; for ( int irow = firstRow; irow <= lastRow; irow++ ) { - int ih = t.RowHit( irow ); +#ifdef EXTERN_ROW_HITS + int ih = slice.TrackletRowHits[iRow * *slice.NTracklets() + itr]; +#else + int ih = t.RowHit( irow ); +#endif if ( ih < 0 ) continue; int index = firstSliceHit + slice.HitInputID( slice.Row( irow ), ih ); AliHLTTPCCAHitLabel &l = fHitLabels[fTracker->Hits()[index].ID()]; @@ -1178,15 +1186,16 @@ void AliHLTTPCCAPerformance::SlicePerformance( int iSlice, bool PrintFlag ) int nRecTot = 0, nGhost = 0, nRecOut = 0; int nMCAll = 0, nRecAll = 0, nClonesAll = 0; int nMCRef = 0, nRecRef = 0, nClonesRef = 0; - const AliHLTTPCCATracker &tracker = hlt.SliceTracker( iSlice ); + //const AliHLTTPCCATracker &tracker = hlt.SliceTracker( iSlice ); + const AliHLTTPCCAClusterData &clusterdata = hlt.ClusterData(iSlice); // Select reconstructable MC tracks { for ( int imc = 0; imc < fNMCTracks; imc++ ) fMCTracks[imc].SetNHits( 0 ); - for ( int ih = 0; ih < tracker.ClusterData()->NumberOfClusters(); ih++ ) { - int id = tracker.ClusterData()->Id( ih ); + for ( int ih = 0; ih < clusterdata.NumberOfClusters(); ih++ ) { + int id = clusterdata.Id( ih ); if ( id < 0 || id > fNHits ) break; AliHLTTPCCAHitLabel &l = fHitLabels[id]; if ( l.fLab[0] >= 0 ) fMCTracks[l.fLab[0]].SetNHits( fMCTracks[l.fLab[0]].NHits() + 1 ); @@ -1210,9 +1219,9 @@ void AliHLTTPCCAPerformance::SlicePerformance( int iSlice, bool PrintFlag ) } } - if ( !tracker.Output() ) return; + //if ( !tracker.Output() ) return; - const AliHLTTPCCASliceOutput &output = *tracker.Output(); + const AliHLTTPCCASliceOutput &output = hlt.Output(iSlice); int traN = output.NTracks(); @@ -1618,7 +1627,8 @@ void AliHLTTPCCAPerformance::ClusterPerformance() t.SetSinPhi( dy * s ); t.SetSignCosPhi( dx ); t.SetDzDs( dz * s ); - hlt.SliceTracker( 0 ).GetErrors2( data.RowNumber( ic ), t, errY, errZ ); + //hlt.SliceTracker( 0 ).GetErrors2( data.RowNumber( ic ), t, errY, errZ ); + hlt.Param(0).GetClusterErrors2( data.RowNumber( ic ), t.GetZ(), t.SinPhi(), t.GetCosPhi(), t.DzDs(), errY, errZ ); errY = TMath::Sqrt( errY ); errZ = TMath::Sqrt( errZ ); } @@ -1737,7 +1747,8 @@ void AliHLTTPCCAPerformance::SmearClustersMC() t.SetSinPhi( dy * s ); t.SetSignCosPhi( dx ); t.SetDzDs( dz * s ); - hlt.SliceTracker( 0 ).GetErrors2( row0, t, errY, errZ ); + //hlt.SliceTracker( 0 ).GetErrors2( row0, t, errY, errZ ); + hlt.Param(0).GetClusterErrors2( row0, t.GetZ(), t.SinPhi(), t.GetCosPhi(), t.DzDs(), errY, errZ ); errY = TMath::Sqrt( errY ); errZ = TMath::Sqrt( errZ ); } diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAProcess.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCAProcess.h index 3d7121482bb..8b0c245aaa7 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAProcess.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAProcess.h @@ -23,30 +23,31 @@ class AliHLTTPCCATracker; #if defined(HLTCA_GPUCODE) template -GPUg() void AliHLTTPCCAProcess() +GPUg() void AliHLTTPCCAProcess(int iSlice) { - AliHLTTPCCATracker &tracker = *( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker ); - + AliHLTTPCCATracker &tracker = ( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker )[iSlice]; GPUshared() typename TProcess::AliHLTTPCCASharedMemory smem; - TProcess::Thread( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, 0, smem, tracker ); - -#define GPUPROCESS(iSync) \ - if( TProcess::NThreadSyncPoints()>=iSync ){ \ - GPUsync(); \ - TProcess::Thread( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, iSync, smem, tracker ); \ + for( int iSync=0; iSync<=TProcess::NThreadSyncPoints(); iSync++){ + __syncthreads(); + TProcess::Thread( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, iSync, smem, tracker ); } +} - GPUPROCESS( 1 ) - GPUPROCESS( 2 ) - GPUPROCESS( 3 ) - - //for( int iSync=0; iSync<=TProcess::NThreadSyncPoints(); iSync++){ - //__syncthreads(); - //TProcess::ThreadGPU( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, iSync, smem, tracker ); - //} +template +GPUg() void AliHLTTPCCAProcessMulti(int firstSlice, int nSliceCount) +{ + const int iSlice = nSliceCount * (blockIdx.x + (gridDim.x % nSliceCount != 0 && nSliceCount * (blockIdx.x + 1) % gridDim.x != 0)) / gridDim.x; + const int nSliceBlockOffset = gridDim.x * iSlice / nSliceCount; + const int sliceBlockId = blockIdx.x - nSliceBlockOffset; + const int sliceGridDim = gridDim.x * (iSlice + 1) / nSliceCount - gridDim.x * (iSlice) / nSliceCount; + AliHLTTPCCATracker &tracker = ( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker )[firstSlice + iSlice]; + GPUshared() typename TProcess::AliHLTTPCCASharedMemory smem; -#undef GPUPROCESS + for( int iSync=0; iSync<=TProcess::NThreadSyncPoints(); iSync++){ + __syncthreads(); + TProcess::Thread( sliceGridDim, blockDim.x, sliceBlockId, threadIdx.x, iSync, smem, tracker ); + } } #else diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCARow.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCARow.h index a809048779f..d04a579a1e5 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCARow.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCARow.h @@ -44,10 +44,7 @@ class AliHLTTPCCARow GPUhd() int HitNumberOffset() const { return fHitNumberOffset; } GPUhd() unsigned int FirstHitInBinOffset() const { return fFirstHitInBinOffset; } -#ifndef CUDA_DEVICE_EMULATION private: -#endif - int fNHits; // number of hits float fX; // X coordinate of the row float fMaxY; // maximal Y coordinate of the row diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceData.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceData.cxx index 3a50b992fc1..5dd8e67e43f 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceData.cxx +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceData.cxx @@ -21,6 +21,8 @@ #include "AliHLTTPCCAHit.h" #include "AliHLTTPCCAParam.h" #include "MemoryAssignmentHelpers.h" +#include "AliHLTTPCCAGPUConfig.h" +#include "AliHLTTPCCAGPUTracker.h" #include // calculates an approximation for 1/sqrt(x) @@ -36,7 +38,7 @@ static inline float fastInvSqrt( float _x ) return x.f; } -inline void AliHLTTPCCASliceData::CreateGrid( AliHLTTPCCARow *row, const AliHLTTPCCAClusterData &data ) +inline void AliHLTTPCCASliceData::CreateGrid( AliHLTTPCCARow *row, const AliHLTTPCCAClusterData &data, int ClusterDataHitNumberOffset ) { // grid creation @@ -50,7 +52,7 @@ inline void AliHLTTPCCASliceData::CreateGrid( AliHLTTPCCARow *row, const AliHLTT float yMax = -1.e3f; float zMin = 1.e3f; float zMax = -1.e3f; - for ( int i = row->fHitNumberOffset; i < row->fHitNumberOffset + row->fNHits; ++i ) { + for ( int i = ClusterDataHitNumberOffset; i < ClusterDataHitNumberOffset + row->fNHits; ++i ) { const float y = data.Y( i ); const float z = data.Z( i ); if ( yMax < y ) yMax = y; @@ -94,8 +96,8 @@ inline void AliHLTTPCCASliceData::PackHitData( AliHLTTPCCARow *row, const AliHLT std::cout << "!!!! hit packing error!!! " << xx << " " << yy << " " << std::endl; } // HitData is bin sorted - fHitDataY[row->fHitNumberOffset + hitIndex] = xx; - fHitDataZ[row->fHitNumberOffset + hitIndex] = yy; + fHitData[row->fHitNumberOffset + hitIndex].x = xx; + fHitData[row->fHitNumberOffset + hitIndex].y = yy; } } @@ -107,49 +109,86 @@ void AliHLTTPCCASliceData::Clear() void AliHLTTPCCASliceData::InitializeRows( const AliHLTTPCCAParam &p ) { // initialisation of rows - + if (!fRows) fRows = new AliHLTTPCCARow[HLTCA_ROW_COUNT + 1]; for ( int i = 0; i < p.NRows(); ++i ) { fRows[i].fX = p.RowX( i ); fRows[i].fMaxY = CAMath::Tan( p.DAlpha() / 2. ) * fRows[i].fX; } } -GPUh() char* AliHLTTPCCASliceData::SetGPUSliceDataMemory(char* pGPUMemory, const AliHLTTPCCAClusterData *data) +#ifndef HLTCA_GPUCODE + AliHLTTPCCASliceData::~AliHLTTPCCASliceData() + { + //Standard Destrcutor + if (fRows) + { + if (!fIsGpuSliceData) delete[] fRows; + fRows = NULL; + } + if (fMemory) + { + if (!fIsGpuSliceData) delete[] fMemory; + fMemory = NULL; + } + + } +#endif + +GPUh() void AliHLTTPCCASliceData::SetGPUSliceDataMemory(void* const pSliceMemory, void* const pRowMemory) { - fMemory = (char*) pGPUMemory; - return(pGPUMemory + SetPointers(data, false)); + //Set Pointer to slice data memory to external memory + fMemory = (char*) pSliceMemory; + fRows = (AliHLTTPCCARow*) pRowMemory; } size_t AliHLTTPCCASliceData::SetPointers(const AliHLTTPCCAClusterData *data, bool allocate) { - const int numberOfRows = data->LastRow() - data->FirstRow(); - enum { kVectorAlignment = sizeof( int ) }; - const int numberOfHitsPlusAlignment = NextMultipleOf < kVectorAlignment / sizeof( int ) > ( fNumberOfHits ); + //Set slice data internal pointers + int hitMemCount = 0; + for ( int rowIndex = data->FirstRow(); rowIndex <= data->LastRow(); ++rowIndex ) + { + hitMemCount += NextMultipleOf(data->NumberOfClusters( rowIndex )); + } + //Calculate Memory needed to store hits in rows + + const int numberOfRows = data->LastRow() - data->FirstRow() + 1; + enum { kVectorAlignment = 256 /*sizeof( uint4 )*/ }; + fNumberOfHitsPlusAlign = NextMultipleOf < (kVectorAlignment > sizeof(HLTCA_GPU_ROWALIGNMENT) ? kVectorAlignment : sizeof(HLTCA_GPU_ROWALIGNMENT)) / sizeof( int ) > ( hitMemCount ); + fNumberOfHits = data->NumberOfClusters(); + const int firstHitInBinSize = (23 + sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(int)) * numberOfRows + 4 * fNumberOfHits + 3; + //FIXME: sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(int) * numberOfRows is way to big and only to ensure to reserve enough memory for GPU Alignment. + //Might be replaced by correct value + const int memorySize = // LinkData, HitData - numberOfHitsPlusAlignment * 4 * sizeof( short ) + + fNumberOfHitsPlusAlign * 4 * sizeof( short ) + // FirstHitInBin - NextMultipleOf( ( 23 * numberOfRows + 4 * fNumberOfHits + 3 ) * sizeof( int ) ) + + NextMultipleOf( ( firstHitInBinSize ) * sizeof( int ) ) + // HitWeights, ClusterDataIndex - numberOfHitsPlusAlignment * 2 * sizeof( int ); + fNumberOfHitsPlusAlign * 2 * sizeof( int ); if ( fMemorySize < memorySize ) { fMemorySize = memorySize; - if (allocate) + if (allocate && !fIsGpuSliceData) { - delete[] fMemory; + if (fMemory) + { + delete[] fMemory; + } fMemory = new char[fMemorySize + 4];// kVectorAlignment]; } } char *mem = fMemory; - AssignMemory( fLinkUpData, mem, numberOfHitsPlusAlignment ); - AssignMemory( fLinkDownData, mem, numberOfHitsPlusAlignment ); - AssignMemory( fHitDataY, mem, numberOfHitsPlusAlignment ); - AssignMemory( fHitDataZ, mem, numberOfHitsPlusAlignment ); - AssignMemory( fFirstHitInBin, mem, 23 * numberOfRows + 4 * fNumberOfHits + 3 ); - AssignMemory( fHitWeights, mem, numberOfHitsPlusAlignment ); - AssignMemory( fClusterDataIndex, mem, numberOfHitsPlusAlignment ); + AssignMemory( fLinkUpData, mem, fNumberOfHitsPlusAlign ); + AssignMemory( fLinkDownData, mem, fNumberOfHitsPlusAlign ); + AssignMemory( fHitData, mem, fNumberOfHitsPlusAlign ); + AssignMemory( fFirstHitInBin, mem, firstHitInBinSize ); + fGpuMemorySize = mem - fMemory; + + //Memory Allocated below will not be copied to GPU but instead be initialized on the gpu itself. Therefore it must not be copied to GPU! + AssignMemory( fHitWeights, mem, fNumberOfHitsPlusAlign ); + AssignMemory( fClusterDataIndex, mem, fNumberOfHitsPlusAlign ); return(mem - fMemory); } @@ -161,9 +200,7 @@ void AliHLTTPCCASliceData::InitFromClusterData( const AliHLTTPCCAClusterData &da // 1. prepare arrays //////////////////////////////////// -#ifdef DEBUG - const int numberOfRows = data.LastRow() - data.FirstRow(); -#endif //DEBUG + const int numberOfRows = data.LastRow() - data.FirstRow() + 1; fNumberOfHits = data.NumberOfClusters(); /* TODO Vectorization @@ -193,7 +230,7 @@ void AliHLTTPCCASliceData::InitFromClusterData( const AliHLTTPCCAClusterData &da row.fHstepYi = 1.f; row.fHstepZi = 1.f; } - for ( int rowIndex = data.LastRow() + 1; rowIndex < 160; ++rowIndex ) { + for ( int rowIndex = data.LastRow() + 1; rowIndex < HLTCA_ROW_COUNT + 1; ++rowIndex ) { AliHLTTPCCARow &row = fRows[rowIndex]; row.fGrid.CreateEmpty(); row.fNHits = 0; @@ -210,26 +247,31 @@ void AliHLTTPCCASliceData::InitFromClusterData( const AliHLTTPCCAClusterData &da } - AliHLTResizableArray binSortedHits( fNumberOfHits ); + AliHLTResizableArray binSortedHits( fNumberOfHits + sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v) * numberOfRows + 1 ); int gridContentOffset = 0; + int hitOffset = 0; int binCreationMemorySize = 103 * 2 + fNumberOfHits; AliHLTResizableArray binCreationMemory( binCreationMemorySize ); + fGPUSharedDataReq = 0; + for ( int rowIndex = data.FirstRow(); rowIndex <= data.LastRow(); ++rowIndex ) { AliHLTTPCCARow &row = fRows[rowIndex]; row.fNHits = data.NumberOfClusters( rowIndex ); assert( row.fNHits < ( 1 << sizeof( unsigned short ) * 8 ) ); - row.fHitNumberOffset = data.RowOffset( rowIndex ); + row.fHitNumberOffset = hitOffset; + hitOffset += NextMultipleOf(data.NumberOfClusters( rowIndex )); + row.fFirstHitInBinOffset = gridContentOffset; - CreateGrid( &row, data ); + CreateGrid( &row, data, data.RowOffset( rowIndex ) ); const AliHLTTPCCAGrid &grid = row.fGrid; const int numberOfBins = grid.N(); int binCreationMemorySizeNew; - if ( ( binCreationMemorySizeNew = numberOfBins * 2 + 6 + row.fNHits ) > binCreationMemorySize ) { + if ( ( binCreationMemorySizeNew = numberOfBins * 2 + 6 + row.fNHits + sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(unsigned short) * numberOfRows + 1) > binCreationMemorySize ) { binCreationMemorySize = binCreationMemorySizeNew; binCreationMemory.Resize( binCreationMemorySize ); } @@ -243,8 +285,9 @@ void AliHLTTPCCASliceData::InitFromClusterData( const AliHLTTPCCAClusterData &da } for ( int hitIndex = 0; hitIndex < row.fNHits; ++hitIndex ) { - const int globalHitIndex = row.fHitNumberOffset + hitIndex; + const int globalHitIndex = data.RowOffset( rowIndex ) + hitIndex; const unsigned short bin = row.fGrid.GetBin( data.Y( globalHitIndex ), data.Z( globalHitIndex ) ); + bins[hitIndex] = bin; ++filled[bin]; } @@ -260,7 +303,7 @@ void AliHLTTPCCASliceData::InitFromClusterData( const AliHLTTPCCAClusterData &da --filled[bin]; const unsigned short ind = c[bin] + filled[bin]; // generate an index for this hit that is >= c[bin] and < c[bin + 1] const int globalBinsortedIndex = row.fHitNumberOffset + ind; - const int globalHitIndex = row.fHitNumberOffset + hitIndex; + const int globalHitIndex = data.RowOffset( rowIndex ) + hitIndex; // allows to find the global hit index / coordinates from a global bin sorted hit index fClusterDataIndex[globalBinsortedIndex] = globalHitIndex; @@ -277,12 +320,18 @@ void AliHLTTPCCASliceData::InitFromClusterData( const AliHLTTPCCAClusterData &da // grid.N is <= row.fNHits const int nn = numberOfBins + grid.Ny() + 3; for ( int i = numberOfBins; i < nn; ++i ) { - assert( row.fFirstHitInBinOffset + i < 23 * numberOfRows + 4 * fNumberOfHits + 3 ); + assert( (signed) row.fFirstHitInBinOffset + i < 23 * numberOfRows + 4 * fNumberOfHits + 3 ); fFirstHitInBin[row.fFirstHitInBinOffset + i] = a; } row.fFullSize = nn; gridContentOffset += nn; + + if (NextMultipleOf(row.fNHits) + nn > (unsigned) fGPUSharedDataReq) + fGPUSharedDataReq = NextMultipleOf(row.fNHits) + nn; + + //Make pointer aligned + gridContentOffset = NextMultipleOf(gridContentOffset); } #if 0 @@ -351,7 +400,7 @@ void AliHLTTPCCASliceData::ClearHitWeights() v0.store( mem ); } #else - for ( int i = 0; i < fNumberOfHits; ++i ) { + for ( int i = 0; i < fNumberOfHitsPlusAlign; ++i ) { fHitWeights[i] = 0; } #endif diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceData.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceData.h index ebdd8c6b759..c6fee6e193b 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceData.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceData.h @@ -21,6 +21,7 @@ #include "AliHLTTPCCARow.h" #include "AliHLTTPCCAMath.h" #include "AliHLTArray.h" +#include "AliHLTTPCCAGPUConfig.h" typedef int int_v; typedef unsigned int uint_v; @@ -41,12 +42,18 @@ class AliHLTTPCCAParam; */ class AliHLTTPCCASliceData { - friend class AliHLTTPCCAGPUTracker; public: AliHLTTPCCASliceData() - : fNumberOfHits( 0 ), fMemorySize( 0 ), fMemory( 0 ), fLinkUpData( 0 ), - fLinkDownData( 0 ), fHitDataY( 0 ), fHitDataZ( 0 ), fClusterDataIndex( 0 ), - fFirstHitInBin( 0 ), fHitWeights( 0 ) {} + : + fIsGpuSliceData(0), fGPUSharedDataReq(0), fNumberOfHits( 0 ), fNumberOfHitsPlusAlign( 0 ), fMemorySize( 0 ), fGpuMemorySize( 0 ), fMemory( 0 ), fGPUTextureBase( 0 ) + ,fRows( NULL ), fLinkUpData( 0 ), fLinkDownData( 0 ), fHitData( 0 ), fClusterDataIndex( 0 ) + , fFirstHitInBin( 0 ), fHitWeights( 0 ) + { + } + +#ifndef HLTCA_GPUCODE + ~AliHLTTPCCASliceData(); +#endif void InitializeRows( const AliHLTTPCCAParam ¶meters ); @@ -55,7 +62,7 @@ class AliHLTTPCCASliceData * data. */ - char* SetGPUSliceDataMemory(char* pGPUMemory, const AliHLTTPCCAClusterData *data); + void SetGPUSliceDataMemory(void* const pSliceMemory, void* const pRowMemory); size_t SetPointers(const AliHLTTPCCAClusterData *data, bool allocate = false); void InitFromClusterData( const AliHLTTPCCAClusterData &data ); @@ -68,6 +75,7 @@ class AliHLTTPCCASliceData * Return the number of hits in this slice. */ GPUhd() int NumberOfHits() const { return fNumberOfHits; } + GPUhd() int NumberOfHitsPlusAlign() const { return fNumberOfHitsPlusAlign; } /** * Access to the hit links. @@ -76,6 +84,13 @@ class AliHLTTPCCASliceData */ short_v HitLinkUpData ( const AliHLTTPCCARow &row, const short_v &hitIndex ) const; short_v HitLinkDownData( const AliHLTTPCCARow &row, const short_v &hitIndex ) const; + + GPUhd() const ushort2 *HitData( const AliHLTTPCCARow &row ) const; + GPUhd() const ushort2 *HitData() const { return(fHitData); } + GPUd() const short_v *HitLinkUpData ( const AliHLTTPCCARow &row ) const; + GPUd() const short_v *HitLinkDownData( const AliHLTTPCCARow &row ) const; + GPUd() const ushort_v *FirstHitInBin( const AliHLTTPCCARow &row ) const; + void SetHitLinkUpData ( const AliHLTTPCCARow &row, const short_v &hitIndex, const short_v &value ); void SetHitLinkDownData( const AliHLTTPCCARow &row, const short_v &hitIndex, @@ -90,8 +105,9 @@ class AliHLTTPCCASliceData * Return the y and z coordinate(s) of the given hit(s). */ // TODO return float_v - short_v HitDataY( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const; - short_v HitDataZ( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const; + ushort_v HitDataY( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const; + ushort_v HitDataZ( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const; + ushort2 HitData( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const; /** * For a given bin index, content tells how many hits there are in the preceding bins. This maps @@ -125,34 +141,53 @@ class AliHLTTPCCASliceData * Return the row object for the given row index. */ const AliHLTTPCCARow &Row( int rowIndex ) const; + GPUhd() AliHLTTPCCARow* Rows() const {return fRows;} -#ifndef CUDA_DEVICE_EMULATION - private: -#endif + GPUh() char *Memory() const {return(fMemory); } + GPUh() size_t MemorySize() const {return(fMemorySize); } + GPUh() size_t GpuMemorySize() const {return(fGpuMemorySize); } + GPUh() int* HitWeights() const {return(fHitWeights); } - AliHLTTPCCASliceData( const AliHLTTPCCASliceData & ) - : fNumberOfHits( 0 ), fMemorySize( 0 ), fMemory( 0 ), fLinkUpData( 0 ), - fLinkDownData( 0 ), fHitDataY( 0 ), fHitDataZ( 0 ), fClusterDataIndex( 0 ), - fFirstHitInBin( 0 ), fHitWeights( 0 ) {} + GPUhd() void SetGPUTextureBase(char* val) {fGPUTextureBase = val;} + GPUhd() char* GPUTextureBase() const { return(fGPUTextureBase); } + GPUhd() char* GPUTextureBaseConst() const { return(fGPUTextureBase); } + + GPUh() int GPUSharedDataReq() const { return fGPUSharedDataReq; } + + void SetGpuSliceData() { fIsGpuSliceData = 1; } + private: + AliHLTTPCCASliceData( const AliHLTTPCCASliceData & ) + : + fIsGpuSliceData(0), fGPUSharedDataReq(0), fNumberOfHits( 0 ), fNumberOfHitsPlusAlign( 0 ), fMemorySize( 0 ), fGpuMemorySize( 0 ), fMemory( 0 ), fGPUTextureBase( 0 ) + ,fRows( NULL ), fLinkUpData( 0 ), fLinkDownData( 0 ), fHitData( 0 ), fClusterDataIndex( 0 ) + , fFirstHitInBin( 0 ), fHitWeights( 0 ) + { + } AliHLTTPCCASliceData& operator=( const AliHLTTPCCASliceData & ) { return *this; } - void CreateGrid( AliHLTTPCCARow *row, const AliHLTTPCCAClusterData &data ); + void CreateGrid( AliHLTTPCCARow *row, const AliHLTTPCCAClusterData &data, int ClusterDataHitNumberOffset ); void PackHitData( AliHLTTPCCARow *row, const AliHLTArray &binSortedHits ); - AliHLTTPCCARow fRows[200]; // The row objects needed for most accessor functions + int fIsGpuSliceData; //Slice Data for GPU Tracker? + int fGPUSharedDataReq; //Size of shared memory required for GPU Reconstruction int fNumberOfHits; // the number of hits in this slice + int fNumberOfHitsPlusAlign; + int fMemorySize; // size of the allocated memory in bytes + int fGpuMemorySize; // size of Memory needed to be transfered to GPU char *fMemory; // pointer to the allocated memory where all the following arrays reside in + char *fGPUTextureBase; // pointer to start of GPU texture - short *fLinkUpData; // hit index in the row above which is linked to the given (global) hit index + AliHLTTPCCARow *fRows; // The row objects needed for most accessor functions + + short *fLinkUpData; // hit index in the row above which is linked to the given (global) hit index short *fLinkDownData; // hit index in the row below which is linked to the given (global) hit index - unsigned short *fHitDataY; // packed y coordinate of the given (global) hit index - unsigned short *fHitDataZ; // packed z coordinate of the given (global) hit index + ushort2 *fHitData; // packed y,z coordinate of the given (global) hit index int *fClusterDataIndex; // see ClusterDataIndex() @@ -176,6 +211,26 @@ GPUd() inline short_v AliHLTTPCCASliceData::HitLinkDownData( const AliHLTTPCCARo return fLinkDownData[row.fHitNumberOffset + hitIndex]; } +GPUd() inline const ushort_v *AliHLTTPCCASliceData::FirstHitInBin( const AliHLTTPCCARow &row ) const +{ + return &fFirstHitInBin[row.fFirstHitInBinOffset]; +} + +GPUd() inline const short_v *AliHLTTPCCASliceData::HitLinkUpData ( const AliHLTTPCCARow &row ) const +{ + return &fLinkUpData[row.fHitNumberOffset]; +} + +GPUd() inline const short_v *AliHLTTPCCASliceData::HitLinkDownData( const AliHLTTPCCARow &row ) const +{ + return &fLinkDownData[row.fHitNumberOffset]; +} + +GPUhd() inline const ushort2 *AliHLTTPCCASliceData::HitData( const AliHLTTPCCARow &row ) const +{ + return &fHitData[row.fHitNumberOffset]; +} + GPUd() inline void AliHLTTPCCASliceData::SetHitLinkUpData ( const AliHLTTPCCARow &row, const short_v &hitIndex, const short_v &value ) { fLinkUpData[row.fHitNumberOffset + hitIndex] = value; @@ -186,14 +241,19 @@ GPUd() inline void AliHLTTPCCASliceData::SetHitLinkDownData( const AliHLTTPCCARo fLinkDownData[row.fHitNumberOffset + hitIndex] = value; } -GPUd() inline short_v AliHLTTPCCASliceData::HitDataY( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const +GPUd() inline ushort_v AliHLTTPCCASliceData::HitDataY( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const +{ + return fHitData[row.fHitNumberOffset + hitIndex].x; +} + +GPUd() inline ushort_v AliHLTTPCCASliceData::HitDataZ( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const { - return fHitDataY[row.fHitNumberOffset + hitIndex]; + return fHitData[row.fHitNumberOffset + hitIndex].y; } -GPUd() inline short_v AliHLTTPCCASliceData::HitDataZ( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const +GPUd() inline ushort2 AliHLTTPCCASliceData::HitData( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const { - return fHitDataZ[row.fHitNumberOffset + hitIndex]; + return fHitData[row.fHitNumberOffset + hitIndex]; } GPUd() inline ushort_v AliHLTTPCCASliceData::FirstHitInBin( const AliHLTTPCCARow &row, ushort_v binIndexes ) const diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceOutput.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceOutput.cxx index 842661d586e..59d7c694a4b 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceOutput.cxx +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceOutput.cxx @@ -18,6 +18,8 @@ //*************************************************************************** #include "AliHLTTPCCASliceOutput.h" +#include "MemoryAssignmentHelpers.h" + GPUhd() int AliHLTTPCCASliceOutput::EstimateSize( int nOfTracks, int nOfTrackClusters ) { @@ -29,6 +31,18 @@ GPUhd() int AliHLTTPCCASliceOutput::EstimateSize( int nOfTracks, int nOfTrackClu } #ifndef HLTCA_GPUCODE + +void AliHLTTPCCASliceOutput::Clear() +{ + //Clear Slice Output and free Memory + if (fMemory) delete[] fMemory; + fMemory = NULL; + fNOutTracks = 0; + fNOutTrackHits = 0; + fNTracks = 0; + fNTrackClusters = 0; +} + template inline void AssignNoAlignment( T *&dst, char *&mem, int count ) { // assign memory to the pointer dst @@ -36,11 +50,11 @@ template inline void AssignNoAlignment( T *&dst, char *&mem, int cou mem = ( char * )( dst + count ); } -GPUhd() void AliHLTTPCCASliceOutput::SetPointers() +void AliHLTTPCCASliceOutput::SetPointers() { // set all pointers - char *mem = &fMemory[0]; + char *mem = fMemory; AssignNoAlignment( fTracks, mem, fNTracks ); AssignNoAlignment( fClusterUnpackedYZ, mem, fNTrackClusters ); AssignNoAlignment( fClusterUnpackedX, mem, fNTrackClusters ); @@ -49,5 +63,24 @@ GPUhd() void AliHLTTPCCASliceOutput::SetPointers() AssignNoAlignment( fClusterRow, mem, fNTrackClusters ); AssignNoAlignment( fClusterPackedAmp, mem, fNTrackClusters ); + // memory for output tracks + + AssignMemory( fOutTracks, mem, fNTracks ); + + // arrays for track hits + + AssignMemory( fOutTrackHits, mem, fNTrackClusters ); + + + fMemorySize = (mem - fMemory); +} + +void AliHLTTPCCASliceOutput::Allocate() +{ + //Allocate All memory needed for slice output + if (fMemory) delete[] fMemory; + SetPointers(); // to calculate the size + fMemory = reinterpret_cast ( new uint4 [ fMemorySize/sizeof( uint4 ) + 100] ); + SetPointers(); // set pointers } #endif diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceOutput.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceOutput.h index d1e83fc0106..f20503b2c47 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceOutput.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceOutput.h @@ -11,8 +11,10 @@ #define ALIHLTTPCCASLICEOUTPUT_H #include "AliHLTTPCCADef.h" - +#include #include "AliHLTTPCCASliceTrack.h" +//Obsolete +#include "AliHLTTPCCAOutTrack.h" /** * @class AliHLTTPCCASliceOutput @@ -29,6 +31,15 @@ class AliHLTTPCCASliceOutput { public: + AliHLTTPCCASliceOutput() + : fNTracks( 0 ), fNTrackClusters( 0 ), fTracks( 0 ), fClusterId( 0 ), fClusterRow( 0 ), fClusterPackedYZ( 0 ), fClusterUnpackedYZ( 0 ), fClusterUnpackedX( 0 ), fClusterPackedAmp( 0 ), + fMemorySize( 0 ), fNOutTracks(0), fNOutTrackHits(0), fOutTracks(0), fOutTrackHits(0), fMemory(NULL) {} + + ~AliHLTTPCCASliceOutput() + { + if (fMemory) delete[] fMemory; + } + GPUhd() int NTracks() const { return fNTracks; } GPUhd() int NTrackClusters() const { return fNTrackClusters; } @@ -41,9 +52,8 @@ class AliHLTTPCCASliceOutput GPUhd() float ClusterUnpackedX ( int i ) const { return fClusterUnpackedX[i]; } GPUhd() static int EstimateSize( int nOfTracks, int nOfTrackClusters ); -#ifndef HLTCA_GPUCODE - GPUhd() void SetPointers(); -#endif + void SetPointers(); + void Allocate(); GPUhd() void SetNTracks ( int v ) { fNTracks = v; } GPUhd() void SetNTrackClusters( int v ) { fNTrackClusters = v; } @@ -56,12 +66,26 @@ class AliHLTTPCCASliceOutput GPUhd() void SetClusterUnpackedYZ( int i, float2 v ) { fClusterUnpackedYZ[i] = v; } GPUhd() void SetClusterUnpackedX( int i, float v ) { fClusterUnpackedX[i] = v; } - private: + char* Memory() const { return(fMemory); } + size_t MemorySize() const { return(fMemorySize); } + + void Clear(); + + //Obsolete Output - AliHLTTPCCASliceOutput( const AliHLTTPCCASliceOutput& ) - : fNTracks( 0 ), fNTrackClusters( 0 ), fTracks( 0 ), fClusterId( 0 ), fClusterRow( 0 ), fClusterPackedYZ( 0 ), fClusterUnpackedYZ( 0 ), fClusterUnpackedX( 0 ), fClusterPackedAmp( 0 ) {} + GPUhd() int NOutTracks() const { return(fNOutTracks); } + GPUhd() void SetNOutTracks(int val) { fNOutTracks = val; } + GPUhd() AliHLTTPCCAOutTrack *OutTracks() const { return fOutTracks; } + GPUhd() const AliHLTTPCCAOutTrack &OutTrack( int index ) const { return fOutTracks[index]; } + GPUhd() int NOutTrackHits() const { return fNOutTrackHits; } + GPUhd() void SetNOutTrackHits(int val) { fNOutTrackHits = val; } + GPUhd() void SetOutTrackHit(int n, int val) { fOutTrackHits[n] = val; } + GPUhd() int OutTrackHit( int i ) const { return fOutTrackHits[i]; } + + private: const AliHLTTPCCASliceOutput& operator=( const AliHLTTPCCASliceOutput& ) const { return *this; } + AliHLTTPCCASliceOutput( const AliHLTTPCCASliceOutput& ); int fNTracks; // number of reconstructed tracks int fNTrackClusters; // total number of track clusters @@ -72,7 +96,18 @@ class AliHLTTPCCASliceOutput float2 *fClusterUnpackedYZ; // pointer to cluster coordinates (temporary data, for debug proposes) float *fClusterUnpackedX; // pointer to cluster coordinates (temporary data, for debug proposes) UChar_t *fClusterPackedAmp; // pointer to packed cluster amplitudes - char fMemory[1]; // the memory where the pointers above point into + size_t fMemorySize; // Amount of memory really used + + // obsolete output + + int fNOutTracks; + int fNOutTrackHits; + AliHLTTPCCAOutTrack *fOutTracks; // output array of the reconstructed tracks + int *fOutTrackHits; // output array of ID's of the reconstructed hits + + //Must be last element of this class, user has to make sure to allocate anough memory consecutive to class memory! + char* fMemory; // the memory where the pointers above point into + }; #endif diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAStandaloneFramework.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCAStandaloneFramework.cxx index 22ba97b564a..1bc93dc317b 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAStandaloneFramework.cxx +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAStandaloneFramework.cxx @@ -26,17 +26,8 @@ #include "AliHLTTPCCAClusterData.h" #include "TStopwatch.h" -//If not building GPU Code then build dummy functions to link against -#ifndef BUILD_GPU -AliHLTTPCCAGPUTracker::AliHLTTPCCAGPUTracker() : gpuTracker(),GPUMemory(0), DebugLevel(0), OutFile(0), GPUMemSize(0) {} -AliHLTTPCCAGPUTracker::~AliHLTTPCCAGPUTracker() {} -int AliHLTTPCCAGPUTracker::InitGPU() {return(0);} -//template inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int alignment) {return(NULL);} -//bool AliHLTTPCCAGPUTracker::CUDA_FAILED_MSG(cudaError_t error) {return(true);} -//int AliHLTTPCCAGPUTracker::CUDASync() {return(1);} -void AliHLTTPCCAGPUTracker::SetDebugLevel(int /*dwLevel*/, std::ostream * /*NewOutFile*/) {}; -int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCATracker* /*tracker*/) {return(1);} -int AliHLTTPCCAGPUTracker::ExitGPU() {return(0);} +#ifdef HLTCA_STANDALONE +#include #endif AliHLTTPCCAStandaloneFramework &AliHLTTPCCAStandaloneFramework::Instance() @@ -47,7 +38,7 @@ AliHLTTPCCAStandaloneFramework &AliHLTTPCCAStandaloneFramework::Instance() } AliHLTTPCCAStandaloneFramework::AliHLTTPCCAStandaloneFramework() - : fMerger(), fGPUTracker(), fStatNEvents( 0 ), fUseGPUTracker(false), fGPUDebugLevel(0) + : fMerger(), fTracker(), fStatNEvents( 0 ), fDebugLevel(0) { //* constructor @@ -58,7 +49,7 @@ AliHLTTPCCAStandaloneFramework::AliHLTTPCCAStandaloneFramework() } AliHLTTPCCAStandaloneFramework::AliHLTTPCCAStandaloneFramework( const AliHLTTPCCAStandaloneFramework& ) - : fMerger(), fGPUTracker(), fStatNEvents( 0 ), fUseGPUTracker(false), fGPUDebugLevel(0) + : fMerger(), fTracker(), fStatNEvents( 0 ), fDebugLevel(0) { //* dummy } @@ -114,9 +105,9 @@ void AliHLTTPCCAStandaloneFramework::FinishDataReading() WriteSettings(outfile); outfile.close(); - event_number++; + event_number++;*/ - std::ifstream infile(filename, std::ifstream::binary); + /*std::ifstream infile(filename, std::ifstream::binary); ReadEvent(infile); infile.close();*/ @@ -127,7 +118,7 @@ void AliHLTTPCCAStandaloneFramework::FinishDataReading() //int -void AliHLTTPCCAStandaloneFramework::ProcessEvent() +void AliHLTTPCCAStandaloneFramework::ProcessEvent(int forceSingleSlice) { // perform the event reconstruction @@ -136,36 +127,38 @@ void AliHLTTPCCAStandaloneFramework::ProcessEvent() TStopwatch timer0; TStopwatch timer1; - if (!fUseGPUTracker || fGPUDebugLevel >= 3) +#ifdef HLTCA_STANDALONE + unsigned long long int startTime, endTime, checkTime; + unsigned long long int cpuTimers[16], gpuTimers[16], tmpFreq; + StandaloneQueryFreq(&tmpFreq); + StandaloneQueryTime(&startTime); +#endif + + if (forceSingleSlice != -1) { - for ( int iSlice = 0; iSlice < fgkNSlices; iSlice++ ) { - fSliceTrackers[iSlice].ReadEvent( &( fClusterData[iSlice] ) ); - fSliceTrackers[iSlice].Reconstruct(); - } - if (fGPUDebugLevel >= 2) printf("\n"); + if (fTracker.ProcessSlices(forceSingleSlice, 1, &fClusterData[forceSingleSlice], &fSliceOutput[forceSingleSlice])) return; } - - if (fUseGPUTracker) + else { - for ( int iSlice = 0; iSlice < fgkNSlices; iSlice++ ) { - fSliceTrackers[iSlice].ReadEvent( &( fClusterData[iSlice] ) ); - if (fGPUTracker.Reconstruct(&fSliceTrackers[iSlice])) - { - printf("Error during GPU Reconstruction!!!\n"); - //return(1); - } - } - if (fGPUDebugLevel >= 2) printf("\n"); + for (int iSlice = 0;iSlice < fgkNSlices;iSlice += fTracker.MaxSliceCount()) + { + if (fTracker.ProcessSlices(iSlice, fTracker.MaxSliceCount(), &fClusterData[iSlice], &fSliceOutput[iSlice])) return; + } } +#ifdef HLTCA_STANDALONE + StandaloneQueryTime(&endTime); + StandaloneQueryTime(&checkTime); +#endif + timer1.Stop(); TStopwatch timer2; fMerger.Clear(); - fMerger.SetSliceParam( fSliceTrackers[0].Param() ); + fMerger.SetSliceParam( fTracker.Param(0) ); for ( int i = 0; i < fgkNSlices; i++ ) { - fMerger.SetSliceData( i, fSliceTrackers[i].Output() ); + fMerger.SetSliceData( i, &fSliceOutput[i] ); } fMerger.Reconstruct(); @@ -177,6 +170,47 @@ void AliHLTTPCCAStandaloneFramework::ProcessEvent() fLastTime[1] = timer1.CpuTime(); fLastTime[2] = timer2.CpuTime(); +#ifdef HLTCA_STANDALONE + printf("Tracking Time: %lld us\nTime uncertainty: %lld ns\n", (endTime - startTime) * 1000000 / tmpFreq, (checkTime - endTime) * 1000000000 / tmpFreq); + + if (fDebugLevel >= 1) + { + const char* tmpNames[16] = {"Initialisation", "Neighbours Finder", "Neighbours Cleaner", "Starts Hits Finder", "Start Hits Sorter", "Weight Cleaner", "Reserved", "Tracklet Constructor", "Tracklet Selector", "Write Output", "Unused", "Unused", "Unused", "Unused", "Unused", "Unused"}; + + for (int i = 0;i < 10;i++) + { + if (i == 6) continue; + cpuTimers[i] = gpuTimers[i] = 0; + for ( int iSlice = 0; iSlice < fgkNSlices;iSlice++) + { + if (forceSingleSlice != -1) iSlice = forceSingleSlice; + cpuTimers[i] += *fTracker.PerfTimer(0, iSlice, i + 1) - *fTracker.PerfTimer(0, iSlice, i); + if (forceSingleSlice != -1 || (fTracker.MaxSliceCount() && (iSlice % fTracker.MaxSliceCount() == 0 || i <= 5))) + gpuTimers[i] += *fTracker.PerfTimer(1, iSlice, i + 1) - *fTracker.PerfTimer(1, iSlice, i); + if (forceSingleSlice != -1) break; + } + if (forceSingleSlice == -1) + { + cpuTimers[i] /= fgkNSlices; + gpuTimers[i] /= fgkNSlices; + } + cpuTimers[i] *= 1000000; + gpuTimers[i] *= 1000000; + cpuTimers[i] /= tmpFreq; + gpuTimers[i] /= tmpFreq; + cpuTimers[i] /= omp_get_max_threads(); + + printf("Execution Time: Task: %20s ", tmpNames[i]); + printf("CPU: %15lld\t\t", cpuTimers[i]); + printf("GPU: %15lld\t\t", gpuTimers[i]); + if (fDebugLevel >=6 && gpuTimers[i]) + printf("Speedup: %4lld%%", cpuTimers[i] * 100 / gpuTimers[i]); + printf("\n"); + } + printf("Execution Time: Task: %20s CPU: %15lld\n", "Merger", (long long int) (timer2.CpuTime() * 1000000)); + } +#endif + for ( int i = 0; i < 3; i++ ) fStatTime[i] += fLastTime[i]; //return(0); @@ -188,7 +222,7 @@ void AliHLTTPCCAStandaloneFramework::WriteSettings( std::ostream &out ) const //* write settings to the file out << NSlices() << std::endl; for ( int iSlice = 0; iSlice < NSlices(); iSlice++ ) { - fSliceTrackers[iSlice].Param().WriteSettings( out ); + fTracker.Param(iSlice).WriteSettings( out ); } } @@ -200,7 +234,7 @@ void AliHLTTPCCAStandaloneFramework::ReadSettings( std::istream &in ) for ( int iSlice = 0; iSlice < nSlices; iSlice++ ) { AliHLTTPCCAParam param; param.ReadSettings ( in ); - fSliceTrackers[iSlice].Initialize( param ); + fTracker.InitializeSliceParam(iSlice, param); } } @@ -237,28 +271,4 @@ void AliHLTTPCCAStandaloneFramework::ReadTracks( std::istream &in ) fStatTime[i] += fLastTime[i]; } //fMerger.Output()->Read( in ); -} - -int AliHLTTPCCAStandaloneFramework::InitGPU() -{ - if (fUseGPUTracker) return(1); - int retVal = fGPUTracker.InitGPU(); - fUseGPUTracker = retVal == 0; - return(retVal); -} - -int AliHLTTPCCAStandaloneFramework::ExitGPU() -{ - if (!fUseGPUTracker) return(1); - return(fGPUTracker.ExitGPU()); -} - -void AliHLTTPCCAStandaloneFramework::SetGPUDebugLevel(int Level, std::ostream *OutFile, std::ostream *GPUOutFile) -{ - fGPUTracker.SetDebugLevel(Level, GPUOutFile); - fGPUDebugLevel = Level; - for (int i = 0;i < fgkNSlices;i++) - { - fSliceTrackers[i].SetGPUDebugLevel(Level, OutFile); - } -} +} \ No newline at end of file diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAStandaloneFramework.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCAStandaloneFramework.h index 175a41f6bd1..c9b5edc6551 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAStandaloneFramework.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAStandaloneFramework.h @@ -10,12 +10,12 @@ #define ALIHLTTPCCASTANDALONEFRAMEWORK_H #include "AliHLTTPCCADef.h" -#include "AliHLTTPCCATracker.h" #include "AliHLTTPCCAMerger.h" #include "AliHLTTPCCAClusterData.h" -#include "AliHLTTPCCAGPUTracker.h" +#include "AliHLTTPCCATrackerFramework.h" #include #include +#include "TStopwatch.h" /** * @class AliHLTTPCCAStandaloneFramework @@ -35,7 +35,9 @@ class AliHLTTPCCAStandaloneFramework static AliHLTTPCCAStandaloneFramework &Instance(); - AliHLTTPCCATracker &SliceTracker( int iSlice ) { return fSliceTrackers[iSlice]; } + const AliHLTTPCCAParam &Param ( int iSlice ) const { return(fTracker.Param(iSlice)); } + const AliHLTTPCCARow &Row ( int iSlice, int iRow ) const { return(fTracker.Row(iSlice, iRow)); } + const AliHLTTPCCASliceOutput &Output( int iSlice ) const { return fSliceOutput[iSlice]; } AliHLTTPCCAMerger &Merger() { return fMerger; } AliHLTTPCCAClusterData &ClusterData( int iSlice ) { return fClusterData[iSlice]; } @@ -59,7 +61,7 @@ class AliHLTTPCCAStandaloneFramework /** * perform event reconstruction */ - void ProcessEvent(); + void ProcessEvent(int forceSingleSlice = -1); int NSlices() const { return fgkNSlices; } @@ -76,9 +78,19 @@ class AliHLTTPCCAStandaloneFramework void ReadEvent( std::istream &in ); void ReadTracks( std::istream &in ); - int InitGPU(); - int ExitGPU(); - void SetGPUDebugLevel(int Level, std::ostream *OutFile = NULL, std::ostream *GPUOutFile = NULL); + int InitGPU(int sliceCount = 1, int forceDeviceID = -1) { return(fTracker.InitGPU(sliceCount, forceDeviceID)); } + int ExitGPU() { return(fTracker.ExitGPU()); } + void SetGPUDebugLevel(int Level, std::ostream *OutFile = NULL, std::ostream *GPUOutFile = NULL) { fDebugLevel = Level; fTracker.SetGPUDebugLevel(Level, OutFile, GPUOutFile); } + int SetGPUTrackerOption(char* OptionName, int OptionValue) {return(fTracker.SetGPUTrackerOption(OptionName, OptionValue));} + int SetGPUTracker(bool enable) { return(fTracker.SetGPUTracker(enable)); } + int GetGPUStatus() { return(fTracker.GetGPUStatus()); } + + int InitializeSliceParam(int iSlice, AliHLTTPCCAParam& param) { return(fTracker.InitializeSliceParam(iSlice, param)); } + +#ifdef HLTCA_STANDALONE + static inline void StandaloneQueryTime(unsigned long long int *i); + static inline void StandaloneQueryFreq(unsigned long long int *i); +#endif private: @@ -87,18 +99,39 @@ class AliHLTTPCCAStandaloneFramework AliHLTTPCCAStandaloneFramework( const AliHLTTPCCAStandaloneFramework& ); const AliHLTTPCCAStandaloneFramework &operator=( const AliHLTTPCCAStandaloneFramework& ) const; - AliHLTTPCCATracker fSliceTrackers[fgkNSlices]; //* array of slice trackers AliHLTTPCCAMerger fMerger; //* global merger AliHLTTPCCAClusterData fClusterData[fgkNSlices]; + AliHLTTPCCASliceOutput fSliceOutput[fgkNSlices]; - AliHLTTPCCAGPUTracker fGPUTracker; + AliHLTTPCCATrackerFramework fTracker; double fLastTime[20]; //* timers double fStatTime[20]; //* timers int fStatNEvents; //* n events proceed - bool fUseGPUTracker; // use the GPU tracker - int fGPUDebugLevel; // debug level for the GPU code + int fDebugLevel; }; +#ifdef HLTCA_STANDALONE + void AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(unsigned long long int *i) + { + #ifdef R__WIN32 + QueryPerformanceCounter((LARGE_INTEGER*) i); + #else + timespec t; + clock_gettime(CLOCK_REALTIME, &t); + *i = (unsigned long long int) t.tv_sec * (unsigned long long int) 1000000000 + (unsigned long long int) t.tv_nsec; + #endif + } + + void AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(unsigned long long int *i) + { + #ifdef R__WIN32 + QueryPerformanceFrequency((LARGE_INTEGER*) i); + #else + *i = 1000000000; + #endif + } +#endif + #endif diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsFinder.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsFinder.cxx index 85134eb5a19..cbf350d629f 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsFinder.cxx +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsFinder.cxx @@ -29,31 +29,46 @@ GPUd() void AliHLTTPCCAStartHitsFinder::Thread if ( iSync == 0 ) { if ( iThread == 0 ) { - if ( iBlock == 0 ) { - CAMath::AtomicExch( tracker.NTracklets(), 0 ); - } s.fNRows = tracker.Param().NRows(); s.fIRow = iBlock + 1; s.fNRowStartHits = 0; if ( s.fIRow <= s.fNRows - 4 ) { s.fNHits = tracker.Row( s.fIRow ).NHits(); - if ( s.fNHits >= 10240 ) s.fNHits = 10230; + if ( s.fNHits >= ALIHLTTPCCASTARTHITSFINDER_MAX_FROWSTARTHITS ) s.fNHits = ALIHLTTPCCASTARTHITSFINDER_MAX_FROWSTARTHITS - 1; } else s.fNHits = -1; } } else if ( iSync == 1 ) { const AliHLTTPCCARow &row = tracker.Row( s.fIRow ); + const AliHLTTPCCARow &rowUp = tracker.Row( s.fIRow + 2 ); for ( int ih = iThread; ih < s.fNHits; ih += nThreads ) { - if ( ( tracker.HitLinkDownData( row, ih ) < 0 ) && ( tracker.HitLinkUpData( row, ih ) >= 0 ) ) { + if (tracker.HitLinkDownData(row, ih) < 0 && tracker.HitLinkUpData(row, ih) >= 0 && tracker.HitLinkUpData(rowUp, tracker.HitLinkUpData(row, ih)) >= 0) { int oldNRowStartHits = CAMath::AtomicAdd( &s.fNRowStartHits, 1 ); s.fRowStartHits[oldNRowStartHits].Set( s.fIRow, ih ); } } } else if ( iSync == 2 ) { if ( iThread == 0 ) { - s.fNOldStartHits = CAMath::AtomicAdd( tracker.NTracklets(), s.fNRowStartHits ); + int nOffset = CAMath::AtomicAdd( tracker.NTracklets(), s.fNRowStartHits ); +#ifdef HLTCA_GPUCODE + if (nOffset + s.fNRowStartHits >= HLTCA_GPU_MAX_TRACKLETS) + { + tracker.GPUParameters()->fGPUError = HLTCA_GPU_ERROR_TRACKLET_OVERFLOW; + CAMath::AtomicExch( tracker.NTracklets(), 0 ); + nOffset = 0; + } +#endif + s.fNOldStartHits = nOffset; +#ifdef HLTCA_GPU_SORT_STARTHITS + tracker.RowStartHitCountOffset()[s.fIRow].x = s.fNRowStartHits; + tracker.RowStartHitCountOffset()[s.fIRow].y = nOffset; +#endif } } else if ( iSync == 3 ) { +#ifdef HLTCA_GPU_SORT_STARTHITS + AliHLTTPCCAHitId *const startHits = tracker.TrackletTmpStartHits(); +#else AliHLTTPCCAHitId *const startHits = tracker.TrackletStartHits(); +#endif for ( int ish = iThread; ish < s.fNRowStartHits; ish += nThreads ) { startHits[s.fNOldStartHits+ish] = s.fRowStartHits[ish]; } diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsFinder.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsFinder.h index 709278d60b5..1facf3c8f7e 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsFinder.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsFinder.h @@ -34,10 +34,7 @@ class AliHLTTPCCAStartHitsFinder AliHLTTPCCASharedMemory& operator=( const AliHLTTPCCASharedMemory& /*dummy*/ ) { return *this; } #endif -#ifndef CUDA_DEVICE_EMULATION protected: -#endif - int fIRow; // row index int fNRows; // n rows int fNHits; // n hits in the row diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsSorter.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsSorter.cxx new file mode 100644 index 00000000000..c8a3498df7a --- /dev/null +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsSorter.cxx @@ -0,0 +1,111 @@ +// @(#) $Id: AliHLTTPCCAStartHitsFinder.cxx 27042 2008-07-02 12:06:02Z richterm $ +// ************************************************************************** +// This file is property of and copyright by the ALICE HLT Project * +// ALICE Experiment at CERN, All rights reserved. * +// * +// Primary Authors: David Rohr * +// for The ALICE HLT Project. * +// * +// Permission to use, copy, modify and distribute this software and its * +// documentation strictly for non-commercial purposes is hereby granted * +// without fee, provided that the above copyright notice appears in all * +// copies and that both the copyright notice and this permission notice * +// appear in the supporting documentation. The authors make no claims * +// about the suitability of this software for any purpose. It is * +// provided "as is" without express or implied warranty. * +// * +//*************************************************************************** + +#include "AliHLTTPCCAStartHitsSorter.h" +#include "AliHLTTPCCATracker.h" + +GPUd() void AliHLTTPCCAStartHitsSorter::Thread +( int nBlocks, int nThreads, int iBlock, int iThread, int iSync, + AliHLTTPCCASharedMemory &s, AliHLTTPCCATracker &tracker ) +{ + //Sorts the Start Hits by Row Index and create RowBlock Data + if ( iSync == 0 ) { + if ( iThread == 0 ) { + const int gpuFixedBlockCount = tracker.GPUParametersConst()->fGPUFixedBlockCount; + const int tmpNRows = tracker.Param().NRows() - 6; + int nRows = iBlock == 29 ? (tmpNRows - (tmpNRows / 30) * 29) : (tmpNRows / 30); + int nStartRow = (tmpNRows / 30) * iBlock + 1; + int startOffset = 0; + int startOffset2 = 0; + int previousBlockEndTracklet = 0; + int nCurrentBlock = 0; + + for (int ir = 1;ir < tracker.Param().NRows() - 5;ir++) + { + if (ir < nStartRow) + startOffset2 += tracker.RowStartHitCountOffset()[ir].x; + + if (iBlock == nBlocks - 1 && nCurrentBlock < gpuFixedBlockCount) + { + startOffset += tracker.RowStartHitCountOffset()[ir].x; + for (int i = previousBlockEndTracklet + HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS;i <= startOffset;i += HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS) + { + if (previousBlockEndTracklet / (HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS) != i / (HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS)) + { + tracker.BlockStartingTracklet()[nCurrentBlock].x = previousBlockEndTracklet; + tracker.BlockStartingTracklet()[nCurrentBlock++].y = HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS; + previousBlockEndTracklet += HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS; + if (nCurrentBlock == gpuFixedBlockCount) + { + break; + } + } + } + if ((ir + 1) % HLTCA_GPU_SCHED_ROW_STEP == 0 && nCurrentBlock < gpuFixedBlockCount) + { + if (previousBlockEndTracklet != startOffset) + { + tracker.BlockStartingTracklet()[nCurrentBlock].x = previousBlockEndTracklet; + tracker.BlockStartingTracklet()[nCurrentBlock++].y = startOffset - previousBlockEndTracklet; + previousBlockEndTracklet = startOffset; + } + } + if (nCurrentBlock == gpuFixedBlockCount) + { + tracker.GPUParameters()->fScheduleFirstDynamicTracklet = previousBlockEndTracklet; + } + } + } + if (iBlock == nBlocks - 1) + { + if (nCurrentBlock < gpuFixedBlockCount) + { + tracker.BlockStartingTracklet()[nCurrentBlock].x = previousBlockEndTracklet; + tracker.BlockStartingTracklet()[nCurrentBlock++].y = startOffset - previousBlockEndTracklet; + tracker.GPUParameters()->fScheduleFirstDynamicTracklet = startOffset; + } + for (int i = nCurrentBlock;i < HLTCA_GPU_BLOCK_COUNT;i++) + { + tracker.BlockStartingTracklet()[i].x = 0; + tracker.BlockStartingTracklet()[i].y = 0; + } + } + s.fStartOffset = startOffset2; + s.fNRows = nRows; + s.fStartRow = nStartRow; + } + } else if ( iSync == 1 ) { + int startOffset = s.fStartOffset; + for (int ir = 0;ir < s.fNRows;ir++) + { + AliHLTTPCCAHitId *const startHits = tracker.TrackletStartHits(); + AliHLTTPCCAHitId *const tmpStartHits = tracker.TrackletTmpStartHits(); + const int tmpLen = tracker.RowStartHitCountOffset()[ir + s.fStartRow].x; //Length of hits in row stored by StartHitsFinder + const int tmpOffset = tracker.RowStartHitCountOffset()[ir + s.fStartRow].y; //Offset of first hit in row of unsorted array by StartHitsFinder + if (iThread == 0) + tracker.RowStartHitCountOffset()[ir + s.fStartRow].z = startOffset; //Store New Offset Value of sorted array + + for (int j = iThread;j < tmpLen;j += nThreads) + { + startHits[startOffset + j] = tmpStartHits[tmpOffset + j]; + } + startOffset += tmpLen; + } + } +} + diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsSorter.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsSorter.h new file mode 100644 index 00000000000..0969c26af71 --- /dev/null +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsSorter.h @@ -0,0 +1,50 @@ +//-*- Mode: C++ -*- +// ************************************************************************ +// This file is property of and copyright by the ALICE HLT Project * +// ALICE Experiment at CERN, All rights reserved. * +// See cxx source for full Copyright notice * +// * +//************************************************************************* + +#ifndef ALIHLTTPCCASTARTHITSSORTER_H +#define ALIHLTTPCCASTARTHITSSORTER_H + +#include "AliHLTTPCCADef.h" +#include "AliHLTTPCCAHitId.h" + +class AliHLTTPCCATracker; + +/** + * @class AliHLTTPCCAStartHitsSorter + * + */ +class AliHLTTPCCAStartHitsSorter +{ + public: + class AliHLTTPCCASharedMemory + { + friend class AliHLTTPCCAStartHitsSorter; + public: +#if !defined(HLTCA_GPUCODE) + AliHLTTPCCASharedMemory() + : fStartRow( 0 ), fNRows( 0 ), fStartOffset( 0 ) {} + + AliHLTTPCCASharedMemory( const AliHLTTPCCASharedMemory& /*dummy*/ ) + : fStartRow( 0 ), fNRows( 0 ), fStartOffset( 0 ) {} + AliHLTTPCCASharedMemory& operator=( const AliHLTTPCCASharedMemory& /*dummy*/ ) { return *this; } +#endif + + protected: + int fStartRow; // start row index + int fNRows; // number of rows to process + int fStartOffset; //start offset for hits sorted by this block + }; + + GPUd() static int NThreadSyncPoints() { return 1; } + + GPUd() static void Thread( int nBlocks, int nThreads, int iBlock, int iThread, int iSync, + AliHLTTPCCASharedMemory &smem, AliHLTTPCCATracker &tracker ); +}; + + +#endif diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrack.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrack.h index 714e974fc9a..d81fcc4f1ca 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrack.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrack.h @@ -38,10 +38,7 @@ class AliHLTTPCCATrack GPUhd() void SetFirstHitID( int v ) { fFirstHitID = v; } GPUhd() void SetParam( AliHLTTPCCATrackParam v ) { fParam = v; }; -#ifndef CUDA_DEVICE_EMULATION private: -#endif - bool fAlive; // flag for mark tracks used by the track merger int fFirstHitID; // index of the first track cell in the track->cell pointer array int fNHits; // number of track cells diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackLinearisation.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackLinearisation.h index 1656f8cadd3..365cf2cd65c 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackLinearisation.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackLinearisation.h @@ -57,10 +57,7 @@ class AliHLTTPCCATrackLinearisation GPUd() void SetDzDs( float v ) { fDzDs = v; } GPUd() void SetQPt( float v ) { fQPt = v; } -#ifndef CUDA_DEVICE_EMULATION private: -#endif - float fSinPhi; // SinPhi float fCosPhi; // CosPhi float fDzDs; // DzDs diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackParam.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackParam.h index 74f5bbe88a9..4e84d3fbe9f 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackParam.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackParam.h @@ -71,6 +71,7 @@ class AliHLTTPCCATrackParam const float *GetPar() const { return fP; } const float *GetCov() const { return fC; } + GPUd() float GetCov(int i) {return fC[i]; } GPUhd() void SetPar( int i, float v ) { fP[i] = v; } GPUhd() void SetCov( int i, float v ) { fC[i] = v; } @@ -136,10 +137,10 @@ class AliHLTTPCCATrackParam GPUd() void Print() const; -#ifndef CUDA_DEVICE_EMULATION private: -#endif - + //WARNING, Track Param Data is copied in the GPU Tracklet Constructor element by element instead of using copy constructor!!! + //This is neccessary for performance reasons!!! + //Changes to Elements of this class therefore must also be applied to TrackletConstructor!!! float fX; // x position float fSignCosPhi; // sign of cosPhi float fP[5]; // 'active' track parameters: Y, Z, SinPhi, DzDs, q/Pt diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.cxx index b745fe90447..a2e442c944c 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.cxx +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.cxx @@ -42,8 +42,12 @@ #include "AliHLTTPCCATrackParam.h" +#include "AliHLTTPCCAGPUConfig.h" +#include "AliHLTTPCCAGPUTracker.h" + #if !defined(HLTCA_GPUCODE) #include +#include #endif //#define DRAW1 @@ -56,23 +60,29 @@ #include "AliHLTTPCCAPerformance.h" #endif +#ifdef HLTCA_STANDALONE +#include "AliHLTTPCCAStandaloneFramework.h" +#endif ClassImp( AliHLTTPCCATracker ) -GPUd() AliHLTTPCCATracker::~AliHLTTPCCATracker() + +#if !defined(HLTCA_GPUCODE) + +AliHLTTPCCATracker::~AliHLTTPCCATracker() { // destructor if (!fIsGPUTracker) { - delete[] fCommonMemory; - delete[] fHitMemory; - delete[] fTrackMemory; + if (fCommonMem) delete fCommonMem; + if (fHitMemory) delete[] fHitMemory; + if (fTrackletMemory) delete[] fTrackletMemory; + if (fTrackMemory) delete[] fTrackMemory; + fCommonMem = NULL; + fHitMemory = fTrackMemory = NULL; } } -#if !defined(HLTCA_GPUCODE) - - // ---------------------------------------------------------------------------------- void AliHLTTPCCATracker::Initialize( const AliHLTTPCCAParam ¶m ) { @@ -94,148 +104,236 @@ void AliHLTTPCCATracker::StartEvent() void AliHLTTPCCATracker::SetGPUTracker() { fIsGPUTracker = true; + fData.SetGpuSliceData(); } char* AliHLTTPCCATracker::SetGPUTrackerCommonMemory(char* pGPUMemory) { - fCommonMemory = (char*) pGPUMemory; - SetPointersCommon(); - return(pGPUMemory + fCommonMemorySize); + fCommonMem = (commonMemoryStruct*) pGPUMemory; + return(pGPUMemory + sizeof(commonMemoryStruct)); } -char* AliHLTTPCCATracker::SetGPUTrackerHitsMemory(char* pGPUMemory, int MaxNHits ) +char* AliHLTTPCCATracker::SetGPUTrackerHitsMemory(char* pGPUMemory, int MaxNHits) { fHitMemory = (char*) pGPUMemory; SetPointersHits(MaxNHits); - return(pGPUMemory + fHitMemorySize); + pGPUMemory += fHitMemorySize; + AssignMemory(fTrackletTmpStartHits, pGPUMemory, NHitsTotal()); + AssignMemory(fRowStartHitCountOffset, pGPUMemory, Param().NRows()); + + return(pGPUMemory); } +char* AliHLTTPCCATracker::SetGPUTrackerTrackletsMemory(char* pGPUMemory, int MaxNTracks) +{ + fTrackletMemory = (char*) pGPUMemory; + SetPointersTracklets(MaxNTracks); + pGPUMemory += fTrackletMemorySize; + AssignMemory(fGPUTrackletTemp, pGPUMemory, MaxNTracks); + AssignMemory(fRowBlockTracklets, pGPUMemory, MaxNTracks * 2 * (Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1)); + AssignMemory(fRowBlockPos, pGPUMemory, 2 * (Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1)); + AssignMemory(fBlockStartingTracklet, pGPUMemory, HLTCA_GPU_BLOCK_COUNT); + + return(pGPUMemory); +} char* AliHLTTPCCATracker::SetGPUTrackerTracksMemory(char* pGPUMemory, int MaxNTracks, int MaxNHits ) { fTrackMemory = (char*) pGPUMemory; SetPointersTracks(MaxNTracks, MaxNHits); - return(pGPUMemory + fTrackMemorySize); + pGPUMemory += fTrackMemorySize; + + return(pGPUMemory); } -void AliHLTTPCCATracker::DumpLinks(std::ostream &out) +void AliHLTTPCCATracker::DumpSliceData(std::ostream &out) { + out << "Slice Data:" << endl; for (int i = 0;i < Param().NRows();i++) { out << "Row: " << i << endl; for (int j = 0;j < Row(i).NHits();j++) { - out << HitLinkUpData(Row(i), j) << ", "; + if (j && j % 16 == 0) out << std::endl; + out << j << '-' << Data().HitDataY(Row(i), j) << '-' << Data().HitDataZ(Row(i), j) << ", "; } out << endl; } } -void AliHLTTPCCATracker::DumpStartHits(std::ostream &out) +void AliHLTTPCCATracker::DumpLinks(std::ostream &out) { - for (int j = 0;j < Param().NRows();j++) + out << "Hit Links:" << endl; + for (int i = 0;i < Param().NRows();i++) { - for (int i = 0;i < *NTracklets();i++) + out << "Row: " << i << endl; + for (int j = 0;j < Row(i).NHits();j++) { - if (TrackletStartHit(i).RowIndex() == j) - out << TrackletStartHit(i).RowIndex() << "-" << TrackletStartHit(i).HitIndex() << endl; + if (j && j % 32 == 0) out << endl; + out << HitLinkUpData(Row(i), j) << "/" << HitLinkDownData(Row(i), j) << ", "; } + out << endl; } - out << endl; } -void AliHLTTPCCATracker::DumpTrackHits(std::ostream &out) +void AliHLTTPCCATracker::DumpHitWeights(std::ostream &out) { - for (int k = 0;k < Param().NRows();k++) + out << "Hit Weights:" << endl; + for (int i = 0;i < Param().NRows();i++) + { + out << "Row: " << i << ":" << endl; + for (int j = 0;j < Row(i).NHits();j++) { - for (int j = 0;j < *NTracks();j++) - { - if (Tracks()[j].NHits() == 0 || !Tracks()[j].Alive()) continue; - if (TrackHits()[Tracks()[j].FirstHitID()].RowIndex() == k) - { - for (int i = 0;i < Tracks()[j].NHits();i++) - { - out << TrackHits()[Tracks()[j].FirstHitID() + i].RowIndex() << "-" << TrackHits()[Tracks()[j].FirstHitID() + i].HitIndex() << ", "; - } - out << "(Track: " << j << ")" << endl; - } - } + if (j && j % 32 == 0) out << endl; + out << HitWeight(Row(i), j) << ", "; } + out << endl; + } +} + +int AliHLTTPCCATracker::starthitSortComparison(const void*a, const void* b) +{ + AliHLTTPCCAHitId* aa = (AliHLTTPCCAHitId*) a; + AliHLTTPCCAHitId* bb = (AliHLTTPCCAHitId*) b; + + if (aa->RowIndex() != bb->RowIndex()) return(aa->RowIndex() - bb->RowIndex()); + return(aa->HitIndex() - bb->HitIndex()); } -int trackletSortComparison(const void* a, const void* b) +void AliHLTTPCCATracker::DumpStartHits(std::ostream &out) { - const AliHLTTPCCATracklet* aa = (AliHLTTPCCATracklet*) a; - const AliHLTTPCCATracklet* bb = (AliHLTTPCCATracklet*) b; - if (aa->NHits() == 0) return(-1); - if (bb->NHits() == 0) return(1); - if (aa->FirstRow() != bb->FirstRow()) + out << "Start Hits: (" << *NTracklets() << ")" << endl; +#ifdef HLTCA_GPU_SORT_DUMPDATA + qsort(TrackletStartHits(), *NTracklets(), sizeof(AliHLTTPCCAHitId), starthitSortComparison); +#endif + for (int i = 0;i < *NTracklets();i++) { - return(aa->FirstRow() - bb->FirstRow()); + out << TrackletStartHit(i).RowIndex() << "-" << TrackletStartHit(i).HitIndex() << endl; } - for (int i = aa->FirstRow();i <= aa->LastRow();i++) + out << endl; +} + +void AliHLTTPCCATracker::DumpTrackHits(std::ostream &out) +{ + out << "Tracks: (" << *NTracks() << ")" << endl; +#ifdef HLTCA_GPU_SORT_DUMPDATA + for (int k = 0;k < Param().NRows();k++) { - if (i >= bb->LastRow()) return(-1); - if (aa->RowHit(i) != bb->RowHit(i)) + for (int l = 0;l < Row(k).NHits();l++) { - return(aa->RowHit(i) - bb->RowHit(i)); - } +#endif + for (int j = 0;j < *NTracks();j++) + { + if (Tracks()[j].NHits() == 0 || !Tracks()[j].Alive()) continue; +#ifdef HLTCA_GPU_SORT_DUMPDATA + if (TrackHits()[Tracks()[j].FirstHitID()].RowIndex() == k && TrackHits()[Tracks()[j].FirstHitID()].HitIndex() == l) + { +#endif + for (int i = 0;i < Tracks()[j].NHits();i++) + { + out << TrackHits()[Tracks()[j].FirstHitID() + i].RowIndex() << "-" << TrackHits()[Tracks()[j].FirstHitID() + i].HitIndex() << ", "; + } + out << "(Track: " << j << ")" << endl; +#ifdef HLTCA_GPU_SORT_DUMPDATA + } + } +#endif + } +#ifdef HLTCA_GPU_SORT_DUMPDATA } - return(0); +#endif } void AliHLTTPCCATracker::DumpTrackletHits(std::ostream &out) { - qsort(Tracklets(), *NTracklets(), sizeof(AliHLTTPCCATracklet), trackletSortComparison); - for (int k = 0;k < Param().NRows();k++) + out << "Tracklets: (" << *NTracklets() << ")" << endl; +#ifdef HLTCA_GPU_SORT_DUMPDATA + AliHLTTPCCAHitId* tmpIds = new AliHLTTPCCAHitId[*NTracklets()]; + AliHLTTPCCATracklet* tmpTracklets = new AliHLTTPCCATracklet[*NTracklets()]; + memcpy(tmpIds, TrackletStartHits(), *NTracklets() * sizeof(AliHLTTPCCAHitId)); + memcpy(tmpTracklets, Tracklets(), *NTracklets() * sizeof(AliHLTTPCCATracklet)); +#ifdef EXTERN_ROW_HITS + int* tmpHits = new int[*NTracklets() * Param().NRows()]; + memcpy(tmpHits, TrackletRowHits(), *NTracklets() * Param().NRows() * sizeof(int)); +#endif + qsort(TrackletStartHits(), *NTracklets(), sizeof(AliHLTTPCCAHitId), starthitSortComparison); + for (int i = 0;i < *NTracklets();i++) { for (int j = 0;j < *NTracklets();j++) { - if (Tracklets()[j].NHits() == 0) continue; - if (Tracklets()[j].LastRow() > Tracklets()[j].FirstRow() && (Tracklets()[j].FirstRow() >= Param().NRows() || Tracklets()[j].LastRow() >= Param().NRows())) - { - printf("\nError: First %d Last %d Num %d", Tracklets()[j].FirstRow(), Tracklets()[j].LastRow(), Tracklets()[j].NHits()); - } - else if (Tracklets()[j].NHits() && Tracklets()[j].FirstRow() == k && Tracklets()[j].LastRow() > Tracklets()[j].FirstRow()) + if (tmpIds[i].RowIndex() == TrackletStartHit(j).RowIndex() && tmpIds[i].HitIndex() == TrackletStartHit(j).HitIndex()) { - for (int i = Tracklets()[j].FirstRow();i <= Tracklets()[j].LastRow();i++) + memcpy(&Tracklets()[j], &tmpTracklets[i], sizeof(AliHLTTPCCATracklet)); +#ifdef EXTERN_ROW_HITS + if (tmpTracklets[i].NHits()) { - if (Tracklets()[j].RowHit(i) != -1) - out << i << "-" << Tracklets()[j].RowHit(i) << ", "; + for (int k = tmpTracklets[i].FirstRow();k <= tmpTracklets[i].LastRow();k++) + { + fTrackletRowHits[k * *NTracklets() + j] = tmpHits[k * *NTracklets() + i]; + } } - out << endl; +#endif + break; + } + } + } + delete[] tmpIds; + delete[] tmpTracklets; +#ifdef EXTERN_ROW_HITS + delete[] tmpHits; +#endif +#endif + for (int j = 0;j < *NTracklets();j++) + { + out << "Tracklet " << j << " (Hits: " << setw(3) << Tracklets()[j].NHits() << ", Start: " << setw(3) << TrackletStartHit(j).RowIndex() << "-" << setw(3) << TrackletStartHit(j).HitIndex() << ") "; + if (Tracklets()[j].NHits() == 0); + else if (Tracklets()[j].LastRow() > Tracklets()[j].FirstRow() && (Tracklets()[j].FirstRow() >= Param().NRows() || Tracklets()[j].LastRow() >= Param().NRows())) + { +#ifdef HLTCA_STANDALONE + printf("\nError: First %d Last %d Num %d", Tracklets()[j].FirstRow(), Tracklets()[j].LastRow(), Tracklets()[j].NHits()); +#endif + } + else if (Tracklets()[j].NHits() && Tracklets()[j].LastRow() > Tracklets()[j].FirstRow()) + { + for (int i = Tracklets()[j].FirstRow();i <= Tracklets()[j].LastRow();i++) + { + //if (Tracklets()[j].RowHit(i) != -1) +#ifdef EXTERN_ROW_HITS + out << i << "-" << fTrackletRowHits[i * fCommonMem->fNTracklets + j] << ", "; +#else + out << i << "-" << Tracklets()[j].RowHit(i) << ", "; +#endif } } + out << endl; } } -void AliHLTTPCCATracker::SetupCommonMemory() +void AliHLTTPCCATracker::SetupCommonMemory() { // set up common memory if (!fIsGPUTracker) { - if ( !fCommonMemory ) { - SetPointersCommon(); // just to calculate the size + if ( !fCommonMem ) { // the 1600 extra bytes are not used unless fCommonMemorySize increases with a later event - fCommonMemory = reinterpret_cast ( new uint4 [ fCommonMemorySize/sizeof( uint4 ) + 100] ); - SetPointersCommon();// set pointers + //fCommonMemory = reinterpret_cast ( new uint4 [ fCommonMemorySize/sizeof( uint4 ) + 100] ); + fCommonMem = new commonMemoryStruct; } - delete[] fHitMemory; - delete[] fTrackMemory; - fHitMemory = 0; - fTrackMemory = 0; + if (fHitMemory) delete[] fHitMemory; + if (fTrackletMemory) delete[] fTrackletMemory; + if (fTrackMemory) delete[] fTrackMemory; } + fHitMemory = fTrackletMemory = fTrackMemory = 0; + fData.Clear(); - *fNTracklets = 0; - *fNTracks = 0 ; - *fNTrackHits = 0; - *fNOutTracks = 0; - *fNOutTrackHits = 0; + fCommonMem->fNTracklets = 0; + fCommonMem->fNTracks = 0 ; + fCommonMem->fNTrackHits = 0; } void AliHLTTPCCATracker::ReadEvent( AliHLTTPCCAClusterData *clusterData ) @@ -248,36 +346,16 @@ void AliHLTTPCCATracker::ReadEvent( AliHLTTPCCAClusterData *clusterData ) //* Convert input hits, create grids, etc. fData.InitFromClusterData( *clusterData ); - { - SetPointersHits( fData.NumberOfHits() ); // to calculate the size - fHitMemory = reinterpret_cast ( new uint4 [ fHitMemorySize/sizeof( uint4 ) + 100] ); + if (!fIsGPUTracker) + { + SetPointersHits( fData.NumberOfHits() ); // to calculate the size + fHitMemory = reinterpret_cast ( new uint4 [ fHitMemorySize/sizeof( uint4 ) + 100] ); + } SetPointersHits( fData.NumberOfHits() ); // set pointers for hits - *fNTracklets = 0; - *fNTracks = 0 ; - *fNOutTracks = 0; - *fNOutTrackHits = 0; } } - -GPUhd() void AliHLTTPCCATracker::SetPointersCommon() -{ - // set all pointers to the event memory - - char *mem = fCommonMemory; - AssignMemory( fNTracklets, mem, 1 ); - AssignMemory( fNTracks, mem, 1 ); - AssignMemory( fNTrackHits, mem, 1 ); - AssignMemory( fNOutTracks, mem, 1 ); - AssignMemory( fNOutTrackHits, mem, 1 ); - - // calculate the size - - fCommonMemorySize = mem - fCommonMemory; -} - - GPUhd() void AliHLTTPCCATracker::SetPointersHits( int MaxNHits ) { // set all pointers to the event memory @@ -286,43 +364,42 @@ GPUhd() void AliHLTTPCCATracker::SetPointersHits( int MaxNHits ) // extra arrays for tpc clusters - AssignMemory( fTrackletStartHits, mem, MaxNHits ); +#ifdef HLTCA_GPU_SORT_STARTHITS_2 + AssignMemory( fTrackletStartHits, mem, MaxNHits + 32); +#else + AssignMemory( fTrackletStartHits, mem, MaxNHits); +#endif + + // calculate the size - // arrays for track hits + fHitMemorySize = mem - fHitMemory; +} - AssignMemory( fTrackHits, mem, 10 * MaxNHits ); +GPUhd() void AliHLTTPCCATracker::SetPointersTracklets( int MaxNTracklets ) +{ + // set all pointers to the tracklets memory + char *mem = fTrackletMemory; - AssignMemory( fOutTrackHits, mem, 10 * MaxNHits ); + // memory for tracklets - // calculate the size + AssignMemory( fTracklets, mem, MaxNTracklets ); +#ifdef EXTERN_ROW_HITS + AssignMemory( fTrackletRowHits, mem, MaxNTracklets * Param().NRows()); +#endif - fHitMemorySize = mem - fHitMemory; + fTrackletMemorySize = mem - fTrackletMemory; } GPUhd() void AliHLTTPCCATracker::SetPointersTracks( int MaxNTracks, int MaxNHits ) { // set all pointers to the tracks memory - char *mem = fTrackMemory; - // memory for tracklets - - AssignMemory( fTracklets, mem, MaxNTracks ); - // memory for selected tracks AssignMemory( fTracks, mem, MaxNTracks ); - - // memory for output - - AlignTo < sizeof( void * ) > ( mem ); - fOutput = reinterpret_cast( mem ); - mem += AliHLTTPCCASliceOutput::EstimateSize( MaxNTracks, MaxNHits ); - - // memory for output tracks - - AssignMemory( fOutTracks, mem, MaxNTracks ); + AssignMemory( fTrackHits, mem, 2 * MaxNHits ); // calculate the size @@ -362,14 +439,26 @@ void AliHLTTPCCATracker::RunStartHitsFinder() void AliHLTTPCCATracker::RunTrackletConstructor() { - AliHLTTPCCAProcess1( 1, TRACKLET_CONSTRUCTOR_NMEMTHREDS + *fNTracklets, *this ); + AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorNewCPU(*this); } void AliHLTTPCCATracker::RunTrackletSelector() { - AliHLTTPCCAProcess( 1, *fNTracklets, *this ); + AliHLTTPCCAProcess( 1, fCommonMem->fNTracklets, *this ); } +#ifdef HLTCA_STANDALONE +void AliHLTTPCCATracker::StandalonePerfTime(int i) +{ + if (fGPUDebugLevel >= 1) + { + AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&fPerfTimers[i]); + } +} +#else +void AliHLTTPCCATracker::StandalonePerfTime(int /*i*/) {} +#endif + GPUh() void AliHLTTPCCATracker::Reconstruct() { //* reconstruction of event @@ -387,6 +476,11 @@ GPUh() void AliHLTTPCCATracker::Reconstruct() //if( fParam.ISlice()<1 ) return; //SG!!! TStopwatch timer0; + + StandalonePerfTime(0); + + fOutput->Clear(); + if (CheckEmptySlice()) return; #ifdef DRAW1 @@ -403,23 +497,25 @@ GPUh() void AliHLTTPCCATracker::Reconstruct() } #endif - *fNTracks = 0; - *fNTracklets = 0; + fCommonMem->fNTracklets = fCommonMem->fNTracks = fCommonMem->fNTrackHits = 0; #if !defined(HLTCA_GPUCODE) - if (fGPUDebugLevel >= 3) + /*if (fGPUDebugLevel >= 6) { *fGPUDebugOut << endl << endl << "Slice: " << Param().ISlice() << endl; - } + *fGPUDebugOut << "Slice Data:" << endl; + DumpSliceData(*fGPUDebugOut); + }*/ + + StandalonePerfTime(1); RunNeighboursFinder(); - if (fGPUDebugLevel >= 3) - { - *fGPUDebugOut << "Neighbours Finder:" << endl; - DumpLinks(*fGPUDebugOut); - } + StandalonePerfTime(2); + + if (fGPUDebugLevel >= 6) DumpLinks(*fGPUDebugOut); + #ifdef HLTCA_INTERNAL_PERFORMANCE //if( Param().ISlice()<=2 ) //AliHLTTPCCAPerformance::Instance().LinkPerformance( Param().ISlice() ); @@ -435,65 +531,66 @@ GPUh() void AliHLTTPCCATracker::Reconstruct() RunNeighboursCleaner(); - if (fGPUDebugLevel >= 3) - { - *fGPUDebugOut << "Neighbours Cleaner:" << endl; - DumpLinks(*fGPUDebugOut); - } + StandalonePerfTime(3); + + if (fGPUDebugLevel >= 6) DumpLinks(*fGPUDebugOut); RunStartHitsFinder(); - if (fGPUDebugLevel >= 3) - { - *fGPUDebugOut << "Start Hits: (" << *fNTracklets << ")" << endl; - DumpStartHits(*fGPUDebugOut); - } - - if (fGPUDebugLevel >= 2) printf("%3d ", *fNTracklets); + StandalonePerfTime(4); + StandalonePerfTime(5); + if (fGPUDebugLevel >= 6) DumpStartHits(*fGPUDebugOut); + fData.ClearHitWeights(); - SetPointersTracks( *fNTracklets * 2, NHitsTotal() ); // to calculate the size + SetPointersTracklets( fCommonMem->fNTracklets * 2 ); // to calculate the size + fTrackletMemory = reinterpret_cast ( new uint4 [ fTrackletMemorySize/sizeof( uint4 ) + 100] ); + SetPointersTracklets( fCommonMem->fNTracklets * 2 ); // set pointers for hits + + SetPointersTracks( fCommonMem->fNTracklets * 2, NHitsTotal() ); // to calculate the size fTrackMemory = reinterpret_cast ( new uint4 [ fTrackMemorySize/sizeof( uint4 ) + 100] ); - SetPointersTracks( *fNTracklets * 2, NHitsTotal() ); // set pointers for hits + SetPointersTracks( fCommonMem->fNTracklets * 2, NHitsTotal() ); // set pointers for hits + + StandalonePerfTime(6); + StandalonePerfTime(7); RunTrackletConstructor(); - if (fGPUDebugLevel >= 3) - { - *fGPUDebugOut << "Tracklet Hits:" << endl; - DumpTrackletHits(*fGPUDebugOut); - } + StandalonePerfTime(8); + + if (fGPUDebugLevel >= 6) DumpTrackletHits(*fGPUDebugOut); + if (fGPUDebugLevel >= 6) DumpHitWeights(*fGPUDebugOut); //std::cout<<"Slice "<= 3) - { - *fGPUDebugOut << "Track Hits: (" << *NTracks() << ")" << endl; - DumpTrackHits(*fGPUDebugOut); - } + if (fGPUDebugLevel >= 6) DumpTrackHits(*fGPUDebugOut); //std::cout<<"Memory used for slice "<= 10*NHitsTotal() ) { + if ( fOutput->NOutTrackHits() >= 10*NHitsTotal() ) { std::cout << "fNOutTrackHits>NHitsTotal()" << std::endl; //exit(0); return;//SG!!! @@ -639,12 +736,12 @@ GPUh() void AliHLTTPCCATracker::WriteOutput() out.SetNHits( out.NHits() + 1 ); } if ( out.NHits() >= 2 ) { - ( *fNOutTracks )++; + fOutput->SetNOutTracks(fOutput->NOutTracks() + 1); } else { - ( *fNOutTrackHits ) = nOutTrackHitsOld; + fOutput->SetNOutTrackHits(nOutTrackHitsOld); } } - +#endif timer.Stop(); fTimers[5] += timer.CpuTime(); @@ -830,16 +927,16 @@ GPUh() void AliHLTTPCCATracker::WriteTracks( std::ostream &out ) //* Write tracks to file out << fTimers[0] << std::endl; - out << *fNOutTrackHits << std::endl; - for ( int ih = 0; ih < *fNOutTrackHits; ih++ ) { - out << fOutTrackHits[ih] << " "; + out << fOutput->NOutTrackHits() << std::endl; + for ( int ih = 0; ih < fOutput->NOutTrackHits(); ih++ ) { + out << fOutput->OutTrackHit(ih) << " "; } out << std::endl; - out << *fNOutTracks << std::endl; + out << fOutput->NOutTracks() << std::endl; - for ( int itr = 0; itr < *fNOutTracks; itr++ ) { - AliHLTTPCCAOutTrack &t = fOutTracks[itr]; + for ( int itr = 0; itr < fOutput->NOutTracks(); itr++ ) { + const AliHLTTPCCAOutTrack &t = fOutput->OutTrack(itr); AliHLTTPCCATrackParam p1 = t.StartPoint(); AliHLTTPCCATrackParam p2 = t.EndPoint(); out << t.NHits() << " "; @@ -868,16 +965,20 @@ GPUh() void AliHLTTPCCATracker::WriteTracks( std::ostream &out ) GPUh() void AliHLTTPCCATracker::ReadTracks( std::istream &in ) { //* Read tracks from file + int tmpval; in >> fTimers[0]; - in >> *fNOutTrackHits; + in >> tmpval; + fOutput->SetNOutTrackHits(tmpval); - for ( int ih = 0; ih < *fNOutTrackHits; ih++ ) { - in >> fOutTrackHits[ih]; + for ( int ih = 0; ih < fOutput->NOutTrackHits(); ih++ ) { + in >> tmpval; + fOutput->SetOutTrackHit(ih, tmpval); } - in >> *fNOutTracks; + in >> tmpval; + fOutput->SetNOutTracks(tmpval); - for ( int itr = 0; itr < *fNOutTracks; itr++ ) { - AliHLTTPCCAOutTrack &t = fOutTracks[itr]; + for ( int itr = 0; itr < fOutput->NOutTracks(); itr++ ) { + AliHLTTPCCAOutTrack &t = fOutput->OutTracks()[itr]; AliHLTTPCCATrackParam p1, p2; int i; float f; diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.h index b6ec6338b64..ca2c287f2f7 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.h @@ -12,6 +12,7 @@ #include "AliHLTTPCCADef.h" +#include "AliHLTTPCCAGPUConfig.h" #include "AliHLTTPCCAParam.h" #include "AliHLTTPCCARow.h" #include "AliHLTTPCCAHit.h" @@ -21,11 +22,12 @@ #include "AliHLTTPCCASliceData.h" #include "AliHLTTPCCATracklet.h" #include "AliHLTTPCCAOutTrack.h" +#include "AliHLTTPCCASliceOutput.h" +#include "AliHLTTPCCATrackletConstructor.h" class AliHLTTPCCATrack; class AliHLTTPCCATrackParam; class AliHLTTPCCAClusterData; -class AliHLTTPCCASliceOutput; /** * @class AliHLTTPCCATracker @@ -39,41 +41,65 @@ class AliHLTTPCCASliceOutput; * The class is under construction. * */ + +class AliHLTTPCCAClusterData; + class AliHLTTPCCATracker { - friend class AliHLTTPCCAGPUTracker; public: AliHLTTPCCATracker() - : - fParam(), + : fParam(), fClusterData( 0 ), fData(), fIsGPUTracker( false ), fGPUDebugLevel( 0 ), fGPUDebugOut( 0 ), - fCommonMemory( 0 ), - fCommonMemorySize( 0 ), + fRowStartHitCountOffset( NULL ), + fTrackletTmpStartHits( NULL ), + fGPUTrackletTemp( NULL ), + fRowBlockTracklets( NULL ), + fRowBlockPos( NULL ), + fBlockStartingTracklet( NULL ), + fGPUParametersConst(), + fCommonMem( 0 ), fHitMemory( 0 ), fHitMemorySize( 0 ), + fTrackletMemory( 0 ), + fTrackletMemorySize( 0 ), fTrackMemory( 0 ), fTrackMemorySize( 0 ), - fNTracklets( 0 ), fTrackletStartHits( 0 ), fTracklets( 0 ), - fNTracks( 0 ), + fTrackletRowHits( NULL ), fTracks( 0 ), - fNTrackHits( 0 ), fTrackHits( 0 ), - fOutput( 0 ), - fNOutTracks( 0 ), - fOutTracks( 0 ), - fNOutTrackHits( 0 ), - fOutTrackHits( 0 ) + fOutput( 0 ) { // constructor } - GPUd() ~AliHLTTPCCATracker(); + ~AliHLTTPCCATracker(); + + struct StructGPUParameters + { + int fScheduleFirstDynamicTracklet; //Last Tracklet with fixed position in sheduling + int fGPUError; //Signalizes error on GPU during GPU Reconstruction, kind of return value + }; + + struct StructGPUParametersConst + { + int fGPUFixedBlockCount; //Count of blocks that is used for this tracker in fixed schedule situations + int fGPUiSlice; + int fGPUnSlices; + }; + + struct commonMemoryStruct + { + int fNTracklets; // number of tracklets + int fNTracks; // number of reconstructed tracks + int fNTrackHits; // number of track hits + StructGPUParameters fGPUParameters; + }; void Initialize( const AliHLTTPCCAParam ¶m ); @@ -98,12 +124,15 @@ class AliHLTTPCCATracker void SetGPUDebugLevel(int Level, std::ostream *NewDebugOut = NULL) {fGPUDebugLevel = Level;if (NewDebugOut) fGPUDebugOut = NewDebugOut;} char* SetGPUTrackerCommonMemory(char* pGPUMemory); - char* SetGPUTrackerHitsMemory(char* pGPUMemory, int MaxNHits ); + char* SetGPUTrackerHitsMemory(char* pGPUMemory, int MaxNHits); + char* SetGPUTrackerTrackletsMemory(char* pGPUMemory, int MaxNTracklets); char* SetGPUTrackerTracksMemory(char* pGPUMemory, int MaxNTracks, int MaxNHits ); //Debugging Stuff + void DumpSliceData(std::ostream &out); //Dump Input Slice Data void DumpLinks(std::ostream &out); //Dump all links to file (for comparison after NeighboursFinder/Cleaner) void DumpStartHits(std::ostream &out); //Same for Start Hits + void DumpHitWeights(std::ostream &out); //.... void DumpTrackHits(std::ostream &out); //Same for Track Hits void DumpTrackletHits(std::ostream &out); //Same for Track Hits @@ -113,13 +142,16 @@ class AliHLTTPCCATracker void FitTrack( const AliHLTTPCCATrack &track, float *t0 = 0 ) const; void FitTrackFull( const AliHLTTPCCATrack &track, float *t0 = 0 ) const; - void SetPointersCommon(); + void SetupCommonMemory(); void SetPointersHits( int MaxNHits ); + void SetPointersTracklets ( int MaxNTracklets ); void SetPointersTracks( int MaxNTracks, int MaxNHits ); -#if !defined(HLTCA_GPUCODE) + void SetOutput( AliHLTTPCCASliceOutput* out ) { fOutput = out; } + void ReadEvent( AliHLTTPCCAClusterData *clusterData ); +#if !defined(HLTCA_GPUCODE) GPUh() void WriteEvent( std::ostream &out ); GPUh() void WriteTracks( std::ostream &out ) ; GPUh() void ReadTracks( std::istream &in ); @@ -128,8 +160,12 @@ class AliHLTTPCCATracker GPUhd() const AliHLTTPCCAParam &Param() const { return fParam; } GPUhd() void SetParam( const AliHLTTPCCAParam &v ) { fParam = v; } - GPUhd() const AliHLTTPCCAClusterData *ClusterData() const { return fClusterData; } + GPUhd() AliHLTTPCCAClusterData *ClusterData() const { return fClusterData; } GPUhd() const AliHLTTPCCASliceData &Data() const { return fData; } + GPUhd() AliHLTTPCCASliceData *pData() {return &fData; } + + GPUh() void ClearSliceDataHitWeights() {fData.ClearHitWeights();} + GPUhd() const AliHLTTPCCARow &Row( int rowIndex ) const { return fData.Row( rowIndex ); } GPUh() const AliHLTTPCCARow &Row( const AliHLTTPCCAHitId &HitId ) const { return fData.Row( HitId.RowIndex() ); } @@ -143,7 +179,12 @@ class AliHLTTPCCATracker GPUd() short HitLinkUpData( const AliHLTTPCCARow &row, int hitIndex ) const { return fData.HitLinkUpData( row, hitIndex ); } GPUd() short HitLinkDownData( const AliHLTTPCCARow &row, int hitIndex ) const { return fData.HitLinkDownData( row, hitIndex ); } - GPUd() int FirstHitInBin( const AliHLTTPCCARow &row, int binIndex ) const { return fData.FirstHitInBin( row, binIndex ); } + GPUd() const ushort2 *HitData( const AliHLTTPCCARow &row ) const { return fData.HitData(row); } + GPUd() const short_v *HitLinkUpData ( const AliHLTTPCCARow &row ) const { return fData.HitLinkUpData(row); } + GPUd() const short_v *HitLinkDownData( const AliHLTTPCCARow &row ) const { return fData.HitLinkDownData(row); } + GPUd() const ushort_v *FirstHitInBin( const AliHLTTPCCARow &row ) const { return fData.FirstHitInBin(row); } + + GPUd() int FirstHitInBin( const AliHLTTPCCARow &row, int binIndex ) const { return fData.FirstHitInBin( row, binIndex ); } GPUd() unsigned short HitDataY( const AliHLTTPCCARow &row, int hitIndex ) const { return fData.HitDataY( row, hitIndex ); @@ -151,6 +192,9 @@ class AliHLTTPCCATracker GPUd() unsigned short HitDataZ( const AliHLTTPCCARow &row, int hitIndex ) const { return fData.HitDataZ( row, hitIndex ); } + GPUd() ushort2 HitData( const AliHLTTPCCARow &row, int hitIndex ) const { + return fData.HitData( row, hitIndex ); + } GPUhd() int HitInputID( const AliHLTTPCCARow &row, int hitIndex ) const { return fData.ClusterDataIndex( row, hitIndex ); } @@ -172,83 +216,107 @@ class AliHLTTPCCATracker return fData.HitWeight( row, hitIndex ); } - GPUhd() int NTracklets() const { return *fNTracklets; } - GPUhd() int *NTracklets() { return fNTracklets; } + GPUhd() int NTracklets() const { return fCommonMem->fNTracklets; } + GPUhd() int *NTracklets() { return &fCommonMem->fNTracklets; } GPUhd() const AliHLTTPCCAHitId &TrackletStartHit( int i ) const { return fTrackletStartHits[i]; } GPUhd() AliHLTTPCCAHitId *TrackletStartHits() const { return fTrackletStartHits; } + GPUhd() AliHLTTPCCAHitId *TrackletTmpStartHits() const { return fTrackletTmpStartHits; } GPUhd() const AliHLTTPCCATracklet &Tracklet( int i ) const { return fTracklets[i]; } GPUhd() AliHLTTPCCATracklet *Tracklets() const { return fTracklets;} + GPUhd() int* TrackletRowHits() { return fTrackletRowHits; } - GPUhd() int *NTracks() const { return fNTracks; } + GPUhd() int *NTracks() const { return &fCommonMem->fNTracks; } GPUhd() AliHLTTPCCATrack *Tracks() const { return fTracks; } - GPUhd() int *NTrackHits() const { return fNTrackHits; } + GPUhd() int *NTrackHits() const { return &fCommonMem->fNTrackHits; } GPUhd() AliHLTTPCCAHitId *TrackHits() const { return fTrackHits; } - GPUhd() const AliHLTTPCCASliceOutput * Output() const { return fOutput; } - - GPUhd() int *NOutTracks() const { return fNOutTracks; } - GPUhd() AliHLTTPCCAOutTrack *OutTracks() const { return fOutTracks; } - GPUhd() const AliHLTTPCCAOutTrack &OutTrack( int index ) const { return fOutTracks[index]; } - GPUhd() int *NOutTrackHits() const { return fNOutTrackHits; } - GPUhd() int *OutTrackHits() const { return fOutTrackHits; } - GPUhd() int OutTrackHit( int i ) const { return fOutTrackHits[i]; } - -#ifndef CUDA_DEVICE_EMULATION - private: + GPUhd() AliHLTTPCCASliceOutput * Output() const { return fOutput; } + + GPUh() commonMemoryStruct *CommonMemory() {return(fCommonMem); } + static GPUh() size_t CommonMemorySize() { return(sizeof(AliHLTTPCCATracker::commonMemoryStruct)); } + GPUh() char* &HitMemory() {return(fHitMemory); } + GPUh() size_t HitMemorySize() const {return(fHitMemorySize); } + GPUh() char* &TrackletMemory() {return(fTrackletMemory); } + GPUh() size_t TrackletMemorySize() const {return(fTrackletMemorySize); } + GPUh() char* &TrackMemory() {return(fTrackMemory); } + GPUh() size_t TrackMemorySize() const {return(fTrackMemorySize); } + GPUh() char* OutputMemory() const {return(fOutput->Memory()); } + GPUh() size_t OutputMemorySize() const {return(fOutput->MemorySize()); } + GPUhd() AliHLTTPCCARow* SliceDataRows() {return(fData.Rows()); } + + GPUhd() uint3* RowStartHitCountOffset() const {return(fRowStartHitCountOffset);} + GPUhd() AliHLTTPCCATrackletConstructor::AliHLTTPCCAGPUTempMemory* GPUTrackletTemp() const {return(fGPUTrackletTemp);} + GPUhd() int* RowBlockTracklets(int reverse, int iRowBlock) const {return(&fRowBlockTracklets[(reverse * ((fParam.NRows() / HLTCA_GPU_SCHED_ROW_STEP) + 1) + iRowBlock) * fCommonMem->fNTracklets]);} + GPUhd() int* RowBlockTracklets() const {return(fRowBlockTracklets);} + GPUhd() int4* RowBlockPos(int reverse, int iRowBlock) const {return(&fRowBlockPos[reverse * ((fParam.NRows() / HLTCA_GPU_SCHED_ROW_STEP) + 1) + iRowBlock]);} + GPUhd() int4* RowBlockPos() const {return(fRowBlockPos);} + GPUhd() uint2* BlockStartingTracklet() const {return(fBlockStartingTracklet);} + GPUhd() StructGPUParameters* GPUParameters() const {return(&fCommonMem->fGPUParameters);} + GPUhd() StructGPUParametersConst* GPUParametersConst() {return(&fGPUParametersConst);} + + GPUh() unsigned long long int* PerfTimer(unsigned int i) {return &fPerfTimers[i]; } + +#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE + char* fStageAtSync; //Pointer to array storing current stage for every thread at every sync point + int* fThreadTimes; #endif - void SetupCommonMemory(); - + private: AliHLTTPCCAParam fParam; // parameters - double fTimers[10]; // running CPU time for different parts of the algorithm + double fTimers[10]; + unsigned long long int fPerfTimers[16]; // running CPU time for different parts of the algorithm + void StandalonePerfTime(int i); /** A pointer to the ClusterData object that the SliceData was created from. This can be used to * merge clusters from inside the SliceTracker code and recreate the SliceData. */ AliHLTTPCCAClusterData *fClusterData; // ^ AliHLTTPCCASliceData fData; // The SliceData object. It is used to encapsulate the storage in memory from the access - //Will this tracker run on GPU? - bool fIsGPUTracker; // is it GPU tracker - int fGPUDebugLevel; // debug level - std::ostream *fGPUDebugOut; // debug stream + bool fIsGPUTracker; // is it GPU tracker object + int fGPUDebugLevel; // debug level + std::ostream *fGPUDebugOut; // debug stream - // event + //GPU Temp Arrays + uint3* fRowStartHitCountOffset; //Offset, length and new offset of start hits in row + AliHLTTPCCAHitId *fTrackletTmpStartHits; //Unsorted start hits + AliHLTTPCCATrackletConstructor::AliHLTTPCCAGPUTempMemory *fGPUTrackletTemp; //Temp Memory for GPU Tracklet Constructor + int* fRowBlockTracklets; //Reference which tracklet is processed in which rowblock next + int4* fRowBlockPos; //x is last tracklet to be processed, y is last tracklet already processed, z is last tracklet to be processed in next iteration, w is initial x value to check if tracklet must be initialized + uint2* fBlockStartingTracklet; - char *fCommonMemory; // common event memory - size_t fCommonMemorySize; // size of the event memory [bytes] + StructGPUParametersConst fGPUParametersConst; + + // event + + commonMemoryStruct *fCommonMem; // common event memory char *fHitMemory; // event memory for hits size_t fHitMemorySize; // size of the event memory [bytes] + char *fTrackletMemory; + size_t fTrackletMemorySize; + char *fTrackMemory; // event memory for tracks size_t fTrackMemorySize; // size of the event memory [bytes] - - int *fNTracklets; // number of tracklets AliHLTTPCCAHitId *fTrackletStartHits; // start hits for the tracklets AliHLTTPCCATracklet *fTracklets; // tracklets + int *fTrackletRowHits; // - int *fNTracks; // number of reconstructed tracks AliHLTTPCCATrack *fTracks; // reconstructed tracks - int *fNTrackHits; // number of track hits AliHLTTPCCAHitId *fTrackHits; // array of track hit numbers // output AliHLTTPCCASliceOutput *fOutput; - // obsolete output - - int *fNOutTracks; // number of tracks in fOutTracks array - AliHLTTPCCAOutTrack *fOutTracks; // output array of the reconstructed tracks - int *fNOutTrackHits; // number of hits in fOutTrackHits array - int *fOutTrackHits; // output array of ID's of the reconstructed hits - // disable copy AliHLTTPCCATracker( const AliHLTTPCCATracker& ); AliHLTTPCCATracker &operator=( const AliHLTTPCCATracker& ); + + static int starthitSortComparison(const void*a, const void* b); }; diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerComponent.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerComponent.cxx index 451e30840ff..fe9ec4b3371 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerComponent.cxx +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerComponent.cxx @@ -30,7 +30,7 @@ using namespace std; #include "AliHLTTPCCATrackerComponent.h" #include "AliHLTTPCTransform.h" -#include "AliHLTTPCCATracker.h" +#include "AliHLTTPCCATrackerFramework.h" #include "AliHLTTPCCAOutTrack.h" #include "AliHLTTPCCAParam.h" #include "AliHLTTPCCATrackConvertor.h" @@ -61,6 +61,7 @@ ClassImp( AliHLTTPCCATrackerComponent ) AliHLTTPCCATrackerComponent::AliHLTTPCCATrackerComponent() : fTracker( NULL ), + fOutput( NULL ), fSolenoidBz( 0 ), fMinNTrackClusters( 0 ), fClusterZCut( 500. ), @@ -83,6 +84,7 @@ AliHLTTPCCATrackerComponent::AliHLTTPCCATrackerComponent( const AliHLTTPCCATrack : AliHLTProcessor(), fTracker( NULL ), + fOutput( NULL), fSolenoidBz( 0 ), fMinNTrackClusters( 30 ), fClusterZCut( 500. ), @@ -109,6 +111,7 @@ AliHLTTPCCATrackerComponent::~AliHLTTPCCATrackerComponent() { // see header file for class documentation delete fTracker; + delete fOutput; } // @@ -322,9 +325,11 @@ int AliHLTTPCCATrackerComponent::DoInit( int argc, const char** argv ) { // Configure the CA tracker component - if ( fTracker ) return EINPROGRESS; + if ( fTracker || fOutput ) return EINPROGRESS; - fTracker = new AliHLTTPCCATracker(); + + fTracker = new AliHLTTPCCATrackerFramework(); + fOutput = new AliHLTTPCCASliceOutput(); TString arguments = ""; for ( int i = 0; i < argc; i++ ) { @@ -341,6 +346,8 @@ int AliHLTTPCCATrackerComponent::DoDeinit() // see header file for class documentation delete fTracker; fTracker = NULL; + delete fOutput; + fOutput = NULL; return 0; } @@ -459,7 +466,8 @@ int AliHLTTPCCATrackerComponent::DoEvent { - if ( !fTracker ) fTracker = new AliHLTTPCCATracker; + if ( !fTracker ) fTracker = new AliHLTTPCCATrackerFramework; + if ( !fOutput ) fOutput = new AliHLTTPCCASliceOutput; int iSec = slice; float inRmin = 83.65; // float inRmax = 133.3; @@ -497,7 +505,7 @@ int AliHLTTPCCATrackerComponent::DoEvent if( fClusterErrorCorrectionY>1.e-4 ) param.SetClusterError2CorrectionY( fClusterErrorCorrectionY*fClusterErrorCorrectionY ); if( fClusterErrorCorrectionZ>1.e-4 ) param.SetClusterError2CorrectionZ( fClusterErrorCorrectionZ*fClusterErrorCorrectionZ ); param.Update(); - fTracker->Initialize( param ); + fTracker->InitializeSliceParam( slice, param ); delete[] rowX; } @@ -575,22 +583,20 @@ int AliHLTTPCCATrackerComponent::DoEvent TStopwatch timerReco; - fTracker->ReadEvent( &clusterData ); - - fTracker->Reconstruct(); + fTracker->ProcessSlices(slice, 1, &clusterData, fOutput); timerReco.Stop(); int ret = 0; Logging( kHLTLogDebug, "HLT::TPCCATracker::DoEvent", "Reconstruct", - "%d tracks found for slice %d", fTracker->NOutTracks(), slice ); + "%d tracks found for slice %d", fOutput->NOutTracks(), slice ); // write reconstructed tracks unsigned int mySize = 0; - int ntracks = *fTracker->NOutTracks(); + int ntracks = fOutput->NOutTracks(); if ( fOutputTRAKSEGS ) { @@ -605,7 +611,7 @@ int AliHLTTPCCATrackerComponent::DoEvent for ( int itr = 0; itr < ntracks; itr++ ) { - AliHLTTPCCAOutTrack &t = fTracker->OutTracks()[itr]; + AliHLTTPCCAOutTrack &t = fOutput->OutTracks()[itr]; //Logging( kHLTLogDebug, "HLT::TPCCATracker::DoEvent", "Wrtite output","track %d with %d hits", itr, t.NHits()); @@ -625,10 +631,10 @@ int AliHLTTPCCATrackerComponent::DoEvent int iFirstRow = 1000; int iLastRow = -1; - int iFirstHit = fTracker->OutTrackHits()[t.FirstHitRef()]; + int iFirstHit = fOutput->OutTrackHit(t.FirstHitRef()); int iLastHit = iFirstHit; for ( int ih = 0; ih < t.NHits(); ih++ ) { - int hitID = fTracker->OutTrackHits()[t.FirstHitRef() + ih ]; + int hitID = fOutput->OutTrackHit(t.FirstHitRef() + ih); int iRow = clusterData.RowNumber( hitID ); if ( iRow < iFirstRow ) { iFirstRow = iRow; iFirstHit = hitID; } if ( iRow > iLastRow ) { iLastRow = iRow; iLastHit = hitID; } @@ -687,7 +693,7 @@ int AliHLTTPCCATrackerComponent::DoEvent currOutTracklet->fNPoints = t.NHits(); for ( int i = 0; i < t.NHits(); i++ ) { - currOutTracklet->fPointIDs[i] = clusterData.Id( fTracker->OutTrackHits()[t.FirstHitRef()+i] ); + currOutTracklet->fPointIDs[i] = clusterData.Id( fOutput->OutTrackHit(t.FirstHitRef()+i) ); } currOutTracklet = ( AliHLTTPCTrackSegmentData* )( ( Byte_t * )currOutTracklet + dSize ); @@ -697,10 +703,10 @@ int AliHLTTPCCATrackerComponent::DoEvent } else { // default output type - mySize = fTracker->Output()->EstimateSize( fTracker->Output()->NTracks(), - fTracker->Output()->NTrackClusters() ); + mySize = fOutput->EstimateSize( fOutput->NTracks(), + fOutput->NTrackClusters() ); if ( mySize <= maxBufferSize ) { - const AliHLTUInt8_t* outputevent = reinterpret_cast( fTracker->Output() ); + const AliHLTUInt8_t* outputevent = reinterpret_cast( fOutput ); for ( unsigned int i = 0; i < mySize; i++ ) outputPtr[i] = outputevent[i]; } else { HLTWarning( "Output buffer size exceed (buffer size %d, current size %d), tracks are not stored", maxBufferSize, mySize ); diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerComponent.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerComponent.h index bd780bfe8b6..3b88c919f24 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerComponent.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerComponent.h @@ -12,7 +12,8 @@ #include "AliHLTProcessor.h" -class AliHLTTPCCATracker; +class AliHLTTPCCATrackerFramework; +class AliHLTTPCCASliceOutput; class AliHLTTPCSpacePointData; /** @@ -75,7 +76,8 @@ class AliHLTTPCCATrackerComponent : public AliHLTProcessor private: /** the tracker object */ - AliHLTTPCCATracker* fTracker; //! transient + AliHLTTPCCATrackerFramework* fTracker; //! transient + AliHLTTPCCASliceOutput* fOutput; /** magnetic field */ double fSolenoidBz; // see above diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.cxx new file mode 100644 index 00000000000..ed9040e2e4e --- /dev/null +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.cxx @@ -0,0 +1,119 @@ +// @(#) $Id: AliHLTTPCCATracker.cxx 34611 2009-09-04 00:22:05Z sgorbuno $ +// ************************************************************************** +// This file is property of and copyright by the ALICE HLT Project * +// ALICE Experiment at CERN, All rights reserved. * +// * +// Primary Authors: Sergey Gorbunov * +// Ivan Kisel * +// for The ALICE HLT Project. * +// * +// Permission to use, copy, modify and distribute this software and its * +// documentation strictly for non-commercial purposes is hereby granted * +// without fee, provided that the above copyright notice appears in all * +// copies and that both the copyright notice and this permission notice * +// appear in the supporting documentation. The authors make no claims * +// about the suitability of this software for any purpose. It is * +// provided "as is" without express or implied warranty. * +// * +//*************************************************************************** + +#include "AliHLTTPCCADef.h" +#include "AliHLTTPCCAGPUConfig.h" +#include "AliHLTTPCCATrackerFramework.h" +#include "AliHLTTPCCAGPUTracker.h" +#include "AliHLTTPCCATracker.h" +#include "AliHLTTPCCAMath.h" +#include "AliHLTTPCCAClusterData.h" + +#ifdef HLTCA_STANDALONE +#include +#endif + +int AliHLTTPCCATrackerFramework::InitGPU(int sliceCount, int forceDeviceID) +{ + //Initialize GPU Tracker and determine if GPU available + int retVal; + if (fGPUTrackerAvailable && (retVal = ExitGPU())) return(retVal); + retVal = fGPUTracker.InitGPU(sliceCount, forceDeviceID); + fUseGPUTracker = fGPUTrackerAvailable = retVal == 0; + fGPUSliceCount = sliceCount; + return(retVal); +} + +int AliHLTTPCCATrackerFramework::ExitGPU() +{ + //Uninitialize GPU Tracker + if (!fGPUTrackerAvailable) return(0); + fUseGPUTracker = false; + fGPUTrackerAvailable = false; + return(fGPUTracker.ExitGPU()); +} + +void AliHLTTPCCATrackerFramework::SetGPUDebugLevel(int Level, std::ostream *OutFile, std::ostream *GPUOutFile) +{ + //Set Debug Level for GPU Tracker and also for CPU Tracker for comparison reasons + fGPUTracker.SetDebugLevel(Level, GPUOutFile); + fGPUDebugLevel = Level; + for (int i = 0;i < fgkNSlices;i++) + { + fCPUTrackers[i].SetGPUDebugLevel(Level, OutFile); + } +} + +int AliHLTTPCCATrackerFramework::SetGPUTracker(bool enable) +{ + //Enable / disable GPU Tracker + if (enable && !fGPUTrackerAvailable) + { + fUseGPUTracker = false; + return(1); + } + fUseGPUTracker = enable; + return(0); +} + +int AliHLTTPCCATrackerFramework::ProcessSlices(int firstSlice, int sliceCount, AliHLTTPCCAClusterData* pClusterData, AliHLTTPCCASliceOutput* pOutput) +{ + //Process sliceCount slices starting from firstslice, in is pClusterData array, out pOutput array + if (fUseGPUTracker) + { + if (fGPUTracker.Reconstruct(pOutput, pClusterData, firstSlice, CAMath::Min(sliceCount, fgkNSlices - firstSlice))) return(1); + } + else + { +#ifdef HLTCA_STANDALONE +#pragma omp parallel for +#endif + for (int iSlice = 0;iSlice < CAMath::Min(sliceCount, fgkNSlices - firstSlice);iSlice++) + { + fCPUTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]); + fCPUTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]); + fCPUTrackers[firstSlice + iSlice].Reconstruct(); + fCPUTrackers[firstSlice + iSlice].SetupCommonMemory(); + } + } + + if (fGPUDebugLevel >= 6 && fUseGPUTracker) + { + fUseGPUTracker = 0; + ProcessSlices(firstSlice, sliceCount, pClusterData, pOutput); + fUseGPUTracker = 1; + } + + //printf("Slice Tracks Output: %d\n", pOutput[0].NTracks()); + return(0); +} + +unsigned long long int* AliHLTTPCCATrackerFramework::PerfTimer(int GPU, int iSlice, int iTimer) +{ + //Performance information for slice trackers + return(GPU ? fGPUTracker.PerfTimer(iSlice, iTimer) : fCPUTrackers[iSlice].PerfTimer(iTimer)); +} + +int AliHLTTPCCATrackerFramework::InitializeSliceParam(int iSlice, AliHLTTPCCAParam ¶m) +{ + //Initialize Tracker Parameters for a slice + if (fGPUTrackerAvailable && fGPUTracker.InitializeSliceParam(iSlice, param)) return(1); + fCPUTrackers[iSlice].Initialize(param); + return(0); +} \ No newline at end of file diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.h new file mode 100644 index 00000000000..9cace75d953 --- /dev/null +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.h @@ -0,0 +1,66 @@ +//-*- Mode: C++ -*- +// @(#) $Id: AliHLTTPCCATracker.h 33907 2009-07-23 13:52:49Z sgorbuno $ +// ************************************************************************ +// This file is property of and copyright by the ALICE HLT Project * +// ALICE Experiment at CERN, All rights reserved. * +// See cxx source for full Copyright notice * +// * +//************************************************************************* + +#ifndef ALIHLTTPCCATRACKERFRAMEWORK_H +#define ALIHLTTPCCATRACKERFRAMEWORK_H + +#include "AliHLTTPCCATracker.h" +#include "AliHLTTPCCAGPUTracker.h" +#include "AliHLTTPCCAParam.h" +#include + +class AliHLTTPCCASliceOutput; +class AliHLTTPCCAClusterData; + +class AliHLTTPCCATrackerFramework +{ +public: + AliHLTTPCCATrackerFramework() : + fGPUTrackerAvailable(false), fUseGPUTracker(false), fGPUDebugLevel(0), fGPUSliceCount(0), fGPUTracker(), fCPUSliceCount(fgkNSlices) + { + fGPUTrackerAvailable = fUseGPUTracker = fGPUSliceCount = (fGPUTracker.InitGPU(1, -1) == 0); + } + ~AliHLTTPCCATrackerFramework() + {} + + int InitGPU(int sliceCount = 1, int forceDeviceID = -1); + int ExitGPU(); + void SetGPUDebugLevel(int Level, std::ostream *OutFile = NULL, std::ostream *GPUOutFile = NULL); + int SetGPUTrackerOption(char* OptionName, int OptionValue) {return(fGPUTracker.SetGPUTrackerOption(OptionName, OptionValue));} + int SetGPUTracker(bool enable); + + int InitializeSliceParam(int iSlice, AliHLTTPCCAParam ¶m); + + int ProcessSlices(int firstSlice, int sliceCount, AliHLTTPCCAClusterData* pClusterData, AliHLTTPCCASliceOutput* pOutput); + unsigned long long int* PerfTimer(int GPU, int iSlice, int iTimer); + + int MaxSliceCount() const { return(fUseGPUTracker ? fGPUSliceCount : fCPUSliceCount); } + int GetGPUStatus() const { return(fGPUTrackerAvailable + fUseGPUTracker); } + + const AliHLTTPCCAParam& Param(int iSlice) const { return(fCPUTrackers[iSlice].Param()); } + const AliHLTTPCCARow& Row(int iSlice, int iRow) const { return(fCPUTrackers[iSlice].Row(iRow)); } //TODO: Should be changed to return only row parameters + +private: + static const int fgkNSlices = 36; //* N slices + + bool fGPUTrackerAvailable; // Is the GPU Tracker Available? + bool fUseGPUTracker; // use the GPU tracker + int fGPUDebugLevel; // debug level for the GPU code + int fGPUSliceCount; //How many slices to process parallel + AliHLTTPCCAGPUTracker fGPUTracker; + + AliHLTTPCCATracker fCPUTrackers[fgkNSlices]; + int fCPUSliceCount; + + AliHLTTPCCATrackerFramework( const AliHLTTPCCATrackerFramework& ); + AliHLTTPCCATrackerFramework &operator=( const AliHLTTPCCATrackerFramework& ); + +}; + +#endif diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATracklet.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCATracklet.h index 5871c81c627..0fa7aa7fbf2 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCATracklet.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCATracklet.h @@ -11,6 +11,7 @@ #include "AliHLTTPCCADef.h" #include "AliHLTTPCCATrackParam.h" +#include "AliHLTTPCCAGPUConfig.h" /** * @class ALIHLTTPCCATracklet @@ -23,35 +24,36 @@ class AliHLTTPCCATracklet public: #if !defined(HLTCA_GPUCODE) - AliHLTTPCCATracklet() : fStartHitID( 0 ), fNHits( 0 ), fFirstRow( 0 ), fLastRow( 0 ), fParam() {}; + AliHLTTPCCATracklet() : fNHits( 0 ), fFirstRow( 0 ), fLastRow( 0 ), fParam() {}; void Dummy() const ; ~AliHLTTPCCATracklet() {} #endif - GPUhd() int StartHitID() const { return fStartHitID; } GPUhd() int NHits() const { return fNHits; } GPUhd() int FirstRow() const { return fFirstRow; } GPUhd() int LastRow() const { return fLastRow; } - GPUhd() const AliHLTTPCCATrackParam &Param() const { return fParam; } + GPUhd() const AliHLTTPCCATrackParam &Param() const { return fParam; } +#ifndef EXTERN_ROW_HITS GPUhd() int RowHit( int i ) const { return fRowHits[i]; } + GPUhd() const int* RowHits() const { return(fRowHits); } +#endif - GPUhd() void SetStartHitID( int v ) { fStartHitID = v; } GPUhd() void SetNHits( int v ) { fNHits = v; } GPUhd() void SetFirstRow( int v ) { fFirstRow = v; } GPUhd() void SetLastRow( int v ) { fLastRow = v; } GPUhd() void SetParam( const AliHLTTPCCATrackParam &v ) { fParam = v; } +#ifndef EXTERN_ROW_HITS GPUhd() void SetRowHit( int irow, int ih ) { fRowHits[irow] = ih; } - -#ifndef CUDA_DEVICE_EMULATION - private: #endif - int fStartHitID; // ID of the starting hit + private: int fNHits; // N hits int fFirstRow; // first TPC row int fLastRow; // last TPC row AliHLTTPCCATrackParam fParam; // tracklet parameters - int fRowHits[160]; // hit index for each TPC row +#ifndef EXTERN_ROW_HITS + int fRowHits[HLTCA_ROW_COUNT + 1]; // hit index for each TPC row +#endif }; #endif diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletConstructor.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletConstructor.cxx index 4d6ca14dc9f..420d387c3ea 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletConstructor.cxx +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletConstructor.cxx @@ -21,11 +21,12 @@ #include "AliHLTTPCCATrackParam.h" #include "AliHLTTPCCATrackParam.h" #include "AliHLTTPCCAGrid.h" -#include "AliHLTTPCCAHitArea.h" #include "AliHLTTPCCAMath.h" #include "AliHLTTPCCADef.h" #include "AliHLTTPCCATracklet.h" #include "AliHLTTPCCATrackletConstructor.h" +#include "MemoryAssignmentHelpers.h" + //#include "AliHLTTPCCAPerformance.h" //#include "TH1D.h" @@ -35,58 +36,11 @@ #include "AliHLTTPCCADisplay.h" #endif +#define kMaxRowGap 4 -GPUd() void AliHLTTPCCATrackletConstructor::Step0 -( int nBlocks, int /*nThreads*/, int iBlock, int iThread, - AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &/*tParam*/ ) -{ - // reconstruction of tracklets, step 0 - - r.fIsMemThread = ( iThread < TRACKLET_CONSTRUCTOR_NMEMTHREDS ); - if ( iThread == 0 ) { - int nTracks = *tracker.NTracklets(); - int nTrPerBlock = nTracks / nBlocks + 1; - s.fNRows = tracker.Param().NRows(); - s.fItr0 = nTrPerBlock * iBlock; - s.fItr1 = s.fItr0 + nTrPerBlock; - if ( s.fItr1 > nTracks ) s.fItr1 = nTracks; - s.fMinStartRow = 158; - s.fMaxEndRow = 0; - } - if ( iThread < 32 ) { - s.fMinStartRow32[iThread] = 158; - } -} - - -GPUd() void AliHLTTPCCATrackletConstructor::Step1 -( int /*nBlocks*/, int /*nThreads*/, int /*iBlock*/, int iThread, - AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam ) +GPUd() void AliHLTTPCCATrackletConstructor::InitTracklet( AliHLTTPCCATrackParam &tParam ) { - // reconstruction of tracklets, step 1 - - r.fItr = s.fItr0 + ( iThread - TRACKLET_CONSTRUCTOR_NMEMTHREDS ); - r.fGo = ( !r.fIsMemThread ) && ( r.fItr < s.fItr1 ); - r.fSave = r.fGo; - r.fNHits = 0; - - if ( !r.fGo ) return; - - r.fStage = 0; - - AliHLTTPCCATracklet &tracklet = tracker.Tracklets()[r.fItr]; - - unsigned int kThread = iThread % 32;//& 00000020; - if ( SAVE() ) for ( int i = 0; i < 160; i++ ) tracklet.SetRowHit( i, -1 ); - - AliHLTTPCCAHitId id = tracker.TrackletStartHits()[r.fItr]; - r.fStartRow = id.RowIndex(); - r.fEndRow = r.fStartRow; - r.fFirstRow = r.fStartRow; - r.fLastRow = r.fFirstRow; - r.fCurrIH = id.HitIndex(); - - CAMath::AtomicMin( &s.fMinStartRow32[kThread], r.fStartRow ); + //Initialize Tracklet Parameters using default values tParam.SetSinPhi( 0 ); tParam.SetDzDs( 0 ); tParam.SetQPt( 0 ); @@ -108,66 +62,110 @@ GPUd() void AliHLTTPCCATrackletConstructor::Step1 tParam.SetCov( 12, 0 ); tParam.SetCov( 13, 0 ); tParam.SetCov( 14, 10. ); - -} - -GPUd() void AliHLTTPCCATrackletConstructor::Step2 -( int /*nBlocks*/, int nThreads, int /*iBlock*/, int iThread, - AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &/*r*/, AliHLTTPCCATracker &/*tracker*/, AliHLTTPCCATrackParam &/*tParam*/ ) -{ - // reconstruction of tracklets, step 2 - - if ( iThread == 0 ) { - //CAMath::AtomicMinGPU(&s.fMinRow, s.fMinRow32[iThread]); - int minStartRow = 158; - int n = ( nThreads > 32 ) ? 32 : nThreads; - for ( int i = 0; i < n; i++ ) { - if ( s.fMinStartRow32[i] < minStartRow ) minStartRow = s.fMinStartRow32[i]; - } - s.fMinStartRow = minStartRow; - } } GPUd() void AliHLTTPCCATrackletConstructor::ReadData -( int iThread, AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, int iRow ) +#ifndef HLTCA_GPU_PREFETCHDATA +( int /*iThread*/, AliHLTTPCCASharedMemory& /*s*/, AliHLTTPCCAThreadMemory& /*r*/, AliHLTTPCCATracker& /*tracker*/, int /*iRow*/ ) +{ + //Prefetch Data to shared memory +#else +( int iThread, AliHLTTPCCASharedMemory& s, AliHLTTPCCAThreadMemory& r, AliHLTTPCCATracker& tracker, int iRow ) { // reconstruction of tracklets, read data step - - if ( r.fIsMemThread ) { const AliHLTTPCCARow &row = tracker.Row( iRow ); - bool jr = !r.fCurrentData; + //bool jr = !r.fCurrentData; // copy hits, grid content and links // FIXME: inefficient copy - const int numberOfHits = row.NHits(); + //const int numberOfHitsAligned = NextMultipleOf(row.NHits()); + +/* +#ifdef HLTCA_GPU_REORDERHITDATA ushort2 *sMem1 = reinterpret_cast( s.fData[jr] ); - for ( int i = iThread; i < numberOfHits; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) { + for ( int i = iThread; i < numberOfHitsAligned; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) { sMem1[i].x = tracker.HitDataY( row, i ); sMem1[i].y = tracker.HitDataZ( row, i ); } - short *sMem2 = reinterpret_cast( s.fData[jr] ) + 2 * numberOfHits; - for ( int i = iThread; i < numberOfHits; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) { - sMem2[i] = tracker.HitLinkUpData( row, i ); +#else + ushort_v *sMem1 = reinterpret_cast( s.fData[jr] ); + for ( int i = iThread; i < numberOfHitsAligned; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) { + sMem1[i] = tracker.HitDataY( row, i ); + } + + ushort_v *sMem1a = reinterpret_cast( s.fData[jr] ) + numberOfHitsAligned; + for ( int i = iThread; i < numberOfHitsAligned; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) { + sMem1a[i] = tracker.HitDataZ( row, i ); } +#endif - unsigned short *sMem3 = reinterpret_cast( s.fData[jr] ) + 3 * numberOfHits; + short *sMem2 = reinterpret_cast( s.fData[jr] ) + 2 * numberOfHitsAligned; + for ( int i = iThread; i < numberOfHitsAligned; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) { + sMem2[i] = tracker.HitLinkUpData( row, i ); + } + + unsigned short *sMem3 = reinterpret_cast( s.fData[jr] ) + 3 * numberOfHitsAligned; const int n = row.FullSize(); // + grid content size for ( int i = iThread; i < n; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) { sMem3[i] = tracker.FirstHitInBin( row, i ); - } - } + }*/ + + /*for (int k = 0;k < 2;k++) + { + HLTCA_GPU_ROWCOPY* sharedMem; + const HLTCA_GPU_ROWCOPY* sourceMem; + int copyCount; + switch (k) + { + case 0: + sourceMem = reinterpret_cast( tracker.HitDataY(row) ); + sharedMem = reinterpret_cast (reinterpret_cast( s.fData[jr] ) + k * numberOfHitsAligned); + copyCount = numberOfHitsAligned * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY); + break; + case 1: + sourceMem = reinterpret_cast( tracker.HitDataZ(row) ); + sharedMem = reinterpret_cast (reinterpret_cast( s.fData[jr] ) + k * numberOfHitsAligned); + copyCount = numberOfHitsAligned * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY); + break; + case 2: + sourceMem = reinterpret_cast( tracker.HitLinkUpData(row) ); + sharedMem = reinterpret_cast (reinterpret_cast( s.fData[jr] ) + k * numberOfHitsAligned); + copyCount = numberOfHitsAligned * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY); + break; + case 1: + sourceMem = reinterpret_cast( tracker.FirstHitInBin(row) ); + sharedMem = reinterpret_cast (reinterpret_cast( s.fData[jr] ) + k * numberOfHitsAligned); + copyCount = NextMultipleOf(row.FullSize()) * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY); + break; + } + for (int i = iThread;i < copyCount;i += TRACKLET_CONSTRUCTOR_NMEMTHREDS) + { + sharedMem[i] = sourceMem[i]; + } + }*/ + + for (unsigned int i = iThread;i < row.FullSize() * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY);i += TRACKLET_CONSTRUCTOR_NMEMTHREDS) + { + reinterpret_cast (reinterpret_cast( s.fData[!r.fCurrentData] ))[i] = reinterpret_cast( tracker.FirstHitInBin(row) )[i]; + } + + const HLTCA_GPU_ROWCOPY* const sourceMem = (const HLTCA_GPU_ROWCOPY *) &row; + HLTCA_GPU_ROWCOPY* const sharedMem = reinterpret_cast ( &s.fRow[!r.fCurrentData] ); + for (unsigned int i = iThread;i < sizeof(AliHLTTPCCARow) / sizeof(HLTCA_GPU_ROWCOPY);i += TRACKLET_CONSTRUCTOR_NMEMTHREDS) + { + sharedMem[i] = sourceMem[i]; + } +#endif } GPUd() void AliHLTTPCCATrackletConstructor::StoreTracklet ( int /*nBlocks*/, int /*nThreads*/, int /*iBlock*/, int /*iThread*/, - AliHLTTPCCASharedMemory &/*s*/, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam ) + AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam ) { // reconstruction of tracklets, tracklet store step - if ( !r.fSave ) return; - //AliHLTTPCCAPerformance::Instance().HNHitsPerTrackCand()->Fill(r.fNHits); do { @@ -175,7 +173,7 @@ GPUd() void AliHLTTPCCATrackletConstructor::StoreTracklet //std::cout<<"tracklet to store: "<( tmpint4 ); +//#ifdef HLTCA_GPU_REORDERHITDATA +// const ushort2 *hits = reinterpret_cast( tmpint4 ); +//#else +//#ifdef HLTCA_GPU_PREFETCHDATA +// const ushort_v *hitsx = reinterpret_cast( tmpint4 ); +// const ushort_v *hitsy = reinterpret_cast( tmpint4 ) + NextMultipleOf(row.NHits()); +//#else +#ifndef HLTCA_GPU_TEXTURE_FETCH + const ushort2 *hits = tracker.HitData(row); +#endif +//#endif +//#endif float fY = tParam.GetY(); float fZ = tParam.GetZ(); @@ -468,15 +546,29 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet { int nY = row.Grid().Ny(); - unsigned short *sGridP = ( reinterpret_cast( tmpint4 ) ) + 3 * row.NHits(); +//#ifdef HLTCA_GPU_PREFETCHDATA +// const unsigned short *sGridP = ( reinterpret_cast( tmpint4 ) ); +//#else +#ifndef HLTCA_GPU_TEXTURE_FETCH + const unsigned short *sGridP = tracker.FirstHitInBin(row); +#endif +//#endif + +#ifdef HLTCA_GPU_TEXTURE_FETCH + fHitYfst = tex1Dfetch(gAliTexRefu, ((char*) tracker.Data().FirstHitInBin(row) - tracker.pData()->GPUTextureBase()) / sizeof(unsigned short) + fIndYmin); + fHitYlst = tex1Dfetch(gAliTexRefu, ((char*) tracker.Data().FirstHitInBin(row) - tracker.pData()->GPUTextureBase()) / sizeof(unsigned short) + fIndYmin+2); + fHitYfst1 = tex1Dfetch(gAliTexRefu, ((char*) tracker.Data().FirstHitInBin(row) - tracker.pData()->GPUTextureBase()) / sizeof(unsigned short) + fIndYmin+nY); + fHitYlst1 = tex1Dfetch(gAliTexRefu, ((char*) tracker.Data().FirstHitInBin(row) - tracker.pData()->GPUTextureBase()) / sizeof(unsigned short) + fIndYmin+nY+2); +#else fHitYfst = sGridP[fIndYmin]; fHitYlst = sGridP[fIndYmin+2]; fHitYfst1 = sGridP[fIndYmin+nY]; fHitYlst1 = sGridP[fIndYmin+nY+2]; - assert( fHitYfst <= row.NHits() ); - assert( fHitYlst <= row.NHits() ); - assert( fHitYfst1 <= row.NHits() ); - assert( fHitYlst1 <= row.NHits() ); +#endif + assert( (signed) fHitYfst <= row.NHits() ); + assert( (signed) fHitYlst <= row.NHits() ); + assert( (signed) fHitYfst1 <= row.NHits() ); + assert( (signed) fHitYlst1 <= row.NHits() ); if ( drawSearch ) { #ifdef DRAW std::cout << " Grid, row " << iRow << ": nHits=" << row.NHits() << ", grid n=" << row.Grid().N() << ", c[n]=" << sGridP[row.Grid().N()] << std::endl; @@ -494,22 +586,27 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet } #endif } - if ( sGridP[row.Grid().N()] != row.NHits() ) { #ifdef DRAW + if ( sGridP[row.Grid().N()] != row.NHits() ) { std::cout << " grid, row " << iRow << ": nHits=" << row.NHits() << ", grid n=" << row.Grid().N() << ", c[n]=" << sGridP[row.Grid().N()] << std::endl; //exit(0); -#endif } +#endif } +#ifdef DRAW if ( drawSearch ) { - #ifdef DRAW std::cout << " tracklet " << r.fItr << ", row " << iRow << ", yz= " << fY << "," << fZ << ": search hits=" << fHitYfst << " " << fHitYlst << " / " << fHitYfst1 << " " << fHitYlst1 << std::endl; std::cout << " hit search :" << std::endl; - #endif } +#endif for ( unsigned int fIh = fHitYfst; fIh < fHitYlst; fIh++ ) { - assert( fIh < row.NHits() ); - ushort2 hh = hits[fIh]; + assert( (signed) fIh < row.NHits() ); + ushort2 hh; +#if defined(HLTCA_GPU_TEXTURE_FETCH) + hh = tex1Dfetch(gAliTexRefu2, ((char*) tracker.Data().HitData() - tracker.pData()->GPUTextureBase()) / sizeof(ushort2) + row.HitNumberOffset() + fIh); +#else + hh = hits[fIh]; +#endif int ddy = ( int )( hh.x ) - fY0; int ddz = ( int )( hh.y ) - fZ0; int dds = CAMath::Abs( ddy ) + CAMath::Abs( ddz ); @@ -524,8 +621,13 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet } } - for ( unsigned int fIh = fHitYfst1; fIh < fHitYlst1; fIh++ ) { - ushort2 hh = hits[fIh]; + for ( unsigned int fIh = fHitYfst1; fIh < fHitYlst1; fIh++ ) { + ushort2 hh; +#if defined(HLTCA_GPU_TEXTURE_FETCH) + hh = tex1Dfetch(gAliTexRefu2, ((char*) tracker.Data().HitData() - tracker.pData()->GPUTextureBase()) / sizeof(ushort2) + row.HitNumberOffset() + fIh); +#else + hh = hits[fIh]; +#endif int ddy = ( int )( hh.x ) - fY0; int ddz = ( int )( hh.y ) - fZ0; int dds = CAMath::Abs( ddy ) + CAMath::Abs( ddz ); @@ -541,7 +643,15 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet } }// end of search for the closest hit - if ( best < 0 ) break; + if ( best < 0 ) + { +#ifndef EXTERN_ROW_HITS + tracklet.SetRowHit(iRow, -1); +#else + tracker.TrackletRowHits()[iRow * s.fNTracklets + r.fItr] = -1; +#endif + break; + } if ( drawSearch ) { #ifdef DRAW std::cout << "hit search " << r.fItr << ", row " << iRow << " hit " << best << " found" << std::endl; @@ -552,7 +662,12 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet #endif } - ushort2 hh = hits[best]; + ushort2 hh; +#if defined(HLTCA_GPU_TEXTURE_FETCH) + hh = tex1Dfetch(gAliTexRefu2, ((char*) tracker.Data().HitData() - tracker.pData()->GPUTextureBase()) / sizeof(ushort2) + row.HitNumberOffset() + best); +#else + hh = hits[best]; +#endif //std::cout<<"mark 3, "<fGPUnSlices * (blockIdx.x + (HLTCA_GPU_BLOCK_COUNT % tracker.GPUParametersConst()->fGPUnSlices != 0 && tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + 1) % HLTCA_GPU_BLOCK_COUNT != 0)) / HLTCA_GPU_BLOCK_COUNT; + const int nSliceBlockOffset = HLTCA_GPU_BLOCK_COUNT * iSlice / tracker.GPUParametersConst()->fGPUnSlices; + const uint2 &nTracklet = tracker.BlockStartingTracklet()[blockIdx.x - nSliceBlockOffset]; + + sMem.fNextTrackletCount = nTracklet.y; + if (sMem.fNextTrackletCount == 0) + { + sMem.fNextTrackletFirstRun = 0; + } + else + { + if (tracker.TrackletStartHits()[nTracklet.x].RowIndex() / HLTCA_GPU_SCHED_ROW_STEP != RowBlock) + { + sMem.fNextTrackletCount = 0; + } + else + { + sMem.fNextTrackletFirst = nTracklet.x; + sMem.fNextTrackletNoDummy = 1; + } + } +#endif + } + else + { + const int nFetchTracks = CAMath::Max(CAMath::Min((*tracker.RowBlockPos(Reverse, RowBlock)).x - (*tracker.RowBlockPos(Reverse, RowBlock)).y, HLTCA_GPU_THREAD_COUNT - TRACKLET_CONSTRUCTOR_NMEMTHREDS), 0); + sMem.fNextTrackletCount = nFetchTracks; + const int nUseTrack = nFetchTracks ? CAMath::AtomicAdd(&(*tracker.RowBlockPos(Reverse, RowBlock)).y, nFetchTracks) : 0; + sMem.fNextTrackletFirst = nUseTrack; + + const int nFillTracks = CAMath::Min(nFetchTracks, nUseTrack + nFetchTracks - (*((volatile int2*) (tracker.RowBlockPos(Reverse, RowBlock)))).x); + if (nFillTracks > 0) + { + const int nStartFillTrack = CAMath::AtomicAdd(&(*tracker.RowBlockPos(Reverse, RowBlock)).x, nFillTracks); + if (nFillTracks + nStartFillTrack >= HLTCA_GPU_MAX_TRACKLETS) + { + tracker.GPUParameters()->fGPUError = HLTCA_GPU_ERROR_ROWBLOCK_TRACKLET_OVERFLOW; + } + for (int i = 0;i < nFillTracks;i++) + { + tracker.RowBlockTracklets(Reverse, RowBlock)[(nStartFillTrack + i) % HLTCA_GPU_MAX_TRACKLETS] = -3; //Dummy filling track + } + } + sMem.fNextTrackletNoDummy = 0; + } + } + __syncthreads(); + mustInit = 0; + if (sMem.fNextTrackletCount == 0) + { + return(-2); //No more track in this RowBlock + } +#if HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS > 0 + else if (threadIdx.x < TRACKLET_CONSTRUCTOR_NMEMTHREDS) + { + return(-1); + } +#endif + else if (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS >= sMem.fNextTrackletCount) + { + return(-1); //No track in this RowBlock for this thread + } + else if (nextTracketlFirstRun) + { + if (threadIdx.x == TRACKLET_CONSTRUCTOR_NMEMTHREDS) sMem.fNextTrackletFirstRun = 0; + mustInit = 1; + return(sMem.fNextTrackletFirst + threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS); + } + else + { + const int nTrackPos = sMem.fNextTrackletFirst + threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS; + mustInit = (nTrackPos < tracker.RowBlockPos(Reverse, RowBlock)->w); + volatile int* const ptrTracklet = &tracker.RowBlockTracklets(Reverse, RowBlock)[nTrackPos % HLTCA_GPU_MAX_TRACKLETS]; + int nTracklet; + int nTryCount = 0; + while ((nTracklet = *ptrTracklet) == -1) + { + for (int i = 0;i < 10000;i++) + sMem.fNextTrackletStupidDummy++; + nTryCount++; + if (nTryCount > 20) + { + tracker.GPUParameters()->fGPUError = HLTCA_GPU_ERROR_SCHEDULE_COLLISION; + return(-1); + } + }; + return(nTracklet); + } +} -GPUd() void AliHLTTPCCATrackletConstructor::Thread -( int nBlocks, int nThreads, int iBlock, int iThread, int iSync, - AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam ) +GPUd() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorNewGPU(AliHLTTPCCATracker *pTracker) { + //Main Tracklet construction function that calls the scheduled (FetchTracklet) and then Processes the tracklet (mainly UpdataTracklet) and at the end stores the tracklet. + //Can also dispatch a tracklet to be rescheduled +#ifdef HLTCA_GPU_EMULATION_SINGLE_TRACKLET + pTracker[0].BlockStartingTracklet()[0].x = HLTCA_GPU_EMULATION_SINGLE_TRACKLET; + pTracker[0].BlockStartingTracklet()[0].y = 1; + for (int i = 1;i < HLTCA_GPU_BLOCK_COUNT;i++) + { + pTracker[0].BlockStartingTracklet()[i].x = pTracker[0].BlockStartingTracklet()[i].y = 0; + } +#endif - // reconstruction of tracklets - if ( iSync == 0 ) { - Step0( nBlocks, nThreads, iBlock, iThread, s, r, tracker, tParam ); - } else if ( iSync == 1 ) { - Step1( nBlocks, nThreads, iBlock, iThread, s, r, tracker, tParam ); - } else if ( iSync == 2 ) { - Step2( nBlocks, nThreads, iBlock, iThread, s, r, tracker, tParam ); - } + GPUshared() AliHLTTPCCASharedMemory sMem; - else if ( iSync == 3 ) - - { - r.fCurrentData = 1; - ReadData( iThread, s, r, tracker, s.fMinStartRow ); - r.fCurrentData = 0; - r.fNMissed = 0; - } else if ( iSync == 3 + 159 + 1 ) { - r.fCurrentData = 1; - int nextRow = s.fMaxEndRow; - if ( nextRow < 0 ) nextRow = 0; - ReadData( iThread, s, r, tracker, nextRow ); - r.fCurrentData = 0; - r.fNMissed = 0; - r.fStage = 2; - if ( r.fGo ) { - const AliHLTTPCCARow &row = tracker.Row( r.fEndRow ); - float x = row.X(); - if ( !tParam.TransportToX( x, tracker.Param().ConstBz(), .999 ) ) r.fGo = 0; - } - } +#ifdef HLTCA_GPU_SCHED_FIXED_START + if (threadIdx.x == TRACKLET_CONSTRUCTOR_NMEMTHREDS) + { + sMem.fNextTrackletFirstRun = 1; + } + __syncthreads(); +#endif - else if ( iSync <= 3 + 159 + 1 + 159 ) { - int iRow, nextRow; - if ( iSync <= 3 + 159 ) { - iRow = iSync - 4; - if ( iRow < s.fMinStartRow ) return; - nextRow = iRow + 1; - if ( nextRow > 158 ) nextRow = 158; - } else { - iRow = 158 - ( iSync - 4 - 159 - 1 ); - if ( iRow > s.fMaxEndRow ) return; - nextRow = iRow - 1; - if ( nextRow < 0 ) nextRow = 0; - } +#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE + if (threadIdx.x == TRACKLET_CONSTRUCTOR_NMEMTHREDS) + { + sMem.fMaxSync = 0; + } + int threadSync = 0; +#endif - if ( r.fIsMemThread ) { - ReadData( iThread, s, r, tracker, nextRow ); - } else { - UpdateTracklet( nBlocks, nThreads, iBlock, iThread, - s, r, tracker, tParam, iRow ); - } - r.fCurrentData = !r.fCurrentData; - } + for (int iReverse = 0;iReverse < 2;iReverse++) + { + for (int iRowBlock = 0;iRowBlock < HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1;iRowBlock++) + { +#ifdef HLTCA_GPU_SCHED_FIXED_SLICE + int iSlice = tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + (HLTCA_GPU_BLOCK_COUNT % tracker.GPUParametersConst()->fGPUnSlices != 0 && tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + 1) % HLTCA_GPU_BLOCK_COUNT != 0)) / HLTCA_GPU_BLOCK_COUNT; +#else + for (int iSlice = 0;iSlice < pTracker[0].GPUParametersConst()->fGPUnSlices;iSlice++) +#endif + { + AliHLTTPCCATracker &tracker = pTracker[iSlice]; + if (sMem.fNextTrackletFirstRun && iSlice != tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + (HLTCA_GPU_BLOCK_COUNT % tracker.GPUParametersConst()->fGPUnSlices != 0 && tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + 1) % HLTCA_GPU_BLOCK_COUNT != 0)) / HLTCA_GPU_BLOCK_COUNT) + { + continue; + } + /*if (!sMem.fNextTrackletFirstRun && tracker.RowBlockPos(1, HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP)->x <= tracker.RowBlockPos(1, HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP)->y) + { + continue; + }*/ + int sharedRowsInitialized = 0; + + int iTracklet; + int mustInit; + while ((iTracklet = FetchTracklet(tracker, sMem, iReverse, iRowBlock, mustInit)) != -2) + { +#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE + CAMath::AtomicMax(&sMem.fMaxSync, threadSync); + __syncthreads(); + threadSync = CAMath::Min(sMem.fMaxSync, 100000000 / blockDim.x / gridDim.x); +#endif +#ifndef HLTCA_GPU_PREFETCHDATA + if (!sharedRowsInitialized) + { +#ifdef HLTCA_GPU_PREFETCH_ROWBLOCK_ONLY + if (iReverse) + { + for (int i = CAMath::Max(0, HLTCA_ROW_COUNT - (iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP) * sizeof(AliHLTTPCCARow) / sizeof(int) + threadIdx.x;i < (HLTCA_ROW_COUNT - iRowBlock * HLTCA_GPU_SCHED_ROW_STEP) * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x) + { + reinterpret_cast(&sMem.fRows)[i] = reinterpret_cast(tracker.SliceDataRows())[i]; + } + } + else + { + for (int i = CAMath::Max(1, iRowBlock * HLTCA_GPU_SCHED_ROW_STEP) * sizeof(AliHLTTPCCARow) / sizeof(int) + threadIdx.x;i < CAMath::Min((iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP, HLTCA_ROW_COUNT) * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x) + { + reinterpret_cast(&sMem.fRows)[i] = reinterpret_cast(tracker.SliceDataRows())[i]; + } + } +#else + for (int i = threadIdx.x;i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x) + { + reinterpret_cast(&sMem.fRows)[i] = reinterpret_cast(tracker.SliceDataRows())[i]; + } +#endif + sharedRowsInitialized = 1; + } +#endif +#ifdef HLTCA_GPU_RESCHED + short2 storeToRowBlock; + int storePosition = 0; + if (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1)) + { + const int nReverse = (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS) / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1); + const int nRowBlock = (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS) % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1); + sMem.fTrackletStoreCount[nReverse][nRowBlock] = 0; + } +#endif + __syncthreads(); + AliHLTTPCCATrackParam tParam; + AliHLTTPCCAThreadMemory rMem; - else if ( iSync == 4 + 159*2 + 1 + 1 ) { // - StoreTracklet( nBlocks, nThreads, iBlock, iThread, - s, r, tracker, tParam ); - } + rMem.fCurrentData = 0; + +#ifdef HLTCA_GPU_EMULATION_DEBUG_TRACKLET + if (iTracklet == HLTCA_GPU_EMULATION_DEBUG_TRACKLET) + { + tracker.GPUParameters()->fGPUError = 1; + } +#endif + AliHLTTPCCAThreadMemory &rMemGlobal = tracker.GPUTrackletTemp()[iTracklet].fThreadMem; + AliHLTTPCCATrackParam &tParamGlobal = tracker.GPUTrackletTemp()[iTracklet].fParam; + if (mustInit) + { + AliHLTTPCCAHitId id = tracker.TrackletStartHits()[iTracklet]; + + rMem.fStartRow = rMem.fEndRow = rMem.fFirstRow = rMem.fLastRow = id.RowIndex(); + rMem.fCurrIH = id.HitIndex(); + rMem.fStage = 0; + rMem.fNHits = 0; + rMem.fNMissed = 0; + + AliHLTTPCCATrackletConstructor::InitTracklet(tParam); + } + else if (iTracklet >= 0) + { + CopyTrackletTempData( rMemGlobal, rMem, tParamGlobal, tParam ); + } +#ifdef HLTCA_GPU_PREFETCHDATA + else if (threadIdx.x < TRACKLET_CONSTRUCTOR_NMEMTHREDS) + { + ReadData(threadIdx.x, sMem, rMem, tracker, iReverse ? (HLTCA_ROW_COUNT - 1 - iRowBlock * HLTCA_GPU_SCHED_ROW_STEP) : (CAMath::Max(1, iRowBlock * HLTCA_GPU_SCHED_ROW_STEP))); + } +#endif + rMem.fItr = iTracklet; + rMem.fGo = (iTracklet >= 0); + +#ifdef HLTCA_GPU_RESCHED + storeToRowBlock.x = iRowBlock + 1; + storeToRowBlock.y = iReverse; +#ifdef HLTCA_GPU_PREFETCHDATA + rMem.fCurrentData ^= 1; + __syncthreads(); +#endif + if (iReverse) + { + for (int j = HLTCA_ROW_COUNT - 1 - iRowBlock * HLTCA_GPU_SCHED_ROW_STEP;j >= CAMath::Max(0, HLTCA_ROW_COUNT - (iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP);j--) + { +#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE + if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && !(j >= rMem.fEndRow || ( j >= rMem.fStartRow && j - rMem.fStartRow % 2 == 0))) + pTracker[0].fStageAtSync[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1; +#endif +#ifdef HLTCA_GPU_PREFETCHDATA + if (threadIdx.x < TRACKLET_CONSTRUCTOR_NMEMTHREDS && j > CAMath::Max(0, HLTCA_ROW_COUNT - (iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP)) + { + ReadData(threadIdx.x, sMem, rMem, tracker, j - 1); + } + else +#endif + if (iTracklet >= 0) + { + UpdateTracklet(gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j); + if (rMem.fNMissed > kMaxRowGap) + { + rMem.fGo = 0; +#ifndef HLTCA_GPU_PREFETCHDATA + break; +#endif + } + } +#ifdef HLTCA_GPU_PREFETCHDATA + __syncthreads(); + rMem.fCurrentData ^= 1; +#endif + } + + if (iTracklet >= 0 && (!rMem.fGo || iRowBlock == HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP)) + { + StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam ); + } + } + else + { + for (int j = CAMath::Max(1, iRowBlock * HLTCA_GPU_SCHED_ROW_STEP);j < CAMath::Min((iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP, HLTCA_ROW_COUNT);j++) + { +#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE + if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && j >= rMem.fStartRow && (rMem.fStage > 0 || rMem.fCurrIH >= 0 || (j - rMem.fStartRow) % 2 == 0 )) + pTracker[0].fStageAtSync[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1; +#endif +#ifdef HLTCA_GPU_PREFETCHDATA + if (threadIdx.x < TRACKLET_CONSTRUCTOR_NMEMTHREDS && j < CAMath::Min((iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP, HLTCA_ROW_COUNT) - 1) + { + ReadData(threadIdx.x, sMem, rMem, tracker, j + 1); + } + else +#endif + if (iTracklet >= 0) + { + UpdateTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j); +#ifndef HLTCA_GPU_PREFETCHDATA + //if (rMem.fNMissed > kMaxRowGap || rMem.fGo == 0) break; //DR!!! CUDA Crashes with this enabled +#endif + } +#ifdef HLTCA_GPU_PREFETCHDATA + __syncthreads(); + rMem.fCurrentData ^= 1; +#endif + } + if (rMem.fGo && (rMem.fNMissed > kMaxRowGap || iRowBlock == HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP)) + { +#if defined(HLTCA_GPU_PREFETCHDATA) | !defined(HLTCA_GPU_PREFETCH_ROWBLOCK_ONLY) + if ( !tParam.TransportToX( tracker.Row( rMem.fEndRow ).X(), tracker.Param().ConstBz(), .999 ) ) +#else + if ( !tParam.TransportToX( sMem.fRows[ rMem.fEndRow ].X(), tracker.Param().ConstBz(), .999 ) ) +#endif + { + rMem.fGo = 0; + } + else + { + storeToRowBlock.x = (HLTCA_ROW_COUNT - rMem.fEndRow) / HLTCA_GPU_SCHED_ROW_STEP; + storeToRowBlock.y = 1; + rMem.fNMissed = 0; + rMem.fStage = 2; + } + } + + if (iTracklet >= 0 && !rMem.fGo) + { + StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam ); + } + } + + if (rMem.fGo && (iRowBlock != HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP || iReverse == 0)) + { + CopyTrackletTempData( rMem, rMemGlobal, tParam, tParamGlobal ); + storePosition = CAMath::AtomicAdd(&sMem.fTrackletStoreCount[storeToRowBlock.y][storeToRowBlock.x], 1); + } +#else + if (iTracklet >= 0) + { + for (int j = rMem.fStartRow;j < HLTCA_ROW_COUNT;j++) + { + UpdateTracklet(gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j); + if (!rMem.fGo) break; + } + + rMem.fNMissed = 0; + rMem.fStage = 2; + if ( rMem.fGo ) + { + if ( !tParam.TransportToX( tracker.Row( rMem.fEndRow ).X(), tracker.Param().ConstBz(), .999 ) ) rMem.fGo = 0; + } + + for (int j = rMem.fEndRow;j >= 0;j--) + { + if (!rMem.fGo) break; + UpdateTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j); + } + + StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam ); + } +#endif +#ifdef HLTCA_GPU_RESCHED + __syncthreads(); + if (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1)) + { + const int nReverse = (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS) / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1); + const int nRowBlock = (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS) % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1); + if (sMem.fTrackletStoreCount[nReverse][nRowBlock]) + { + sMem.fTrackletStoreCount[nReverse][nRowBlock] = CAMath::AtomicAdd(&tracker.RowBlockPos(nReverse, nRowBlock)->x, sMem.fTrackletStoreCount[nReverse][nRowBlock]); + } + } + __syncthreads(); + if (iTracklet >= 0 && rMem.fGo && (iRowBlock != HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP || iReverse == 0)) + { + tracker.RowBlockTracklets(storeToRowBlock.y, storeToRowBlock.x)[sMem.fTrackletStoreCount[storeToRowBlock.y][storeToRowBlock.x] + storePosition] = iTracklet; + } + __syncthreads(); +#endif + } + } + } + } +#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE + +#endif } +GPUd() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorInit(int iTracklet, AliHLTTPCCATracker &tracker) +{ + //Initialize Row Blocks + +#ifndef HLTCA_GPU_EMULATION_SINGLE_TRACKLET +AliHLTTPCCAHitId id = tracker.TrackletStartHits()[iTracklet]; +#ifdef HLTCA_GPU_SCHED_FIXED_START + const int firstDynamicTracklet = tracker.GPUParameters()->fScheduleFirstDynamicTracklet; + if (iTracklet >= firstDynamicTracklet) +#endif + { + const int firstTrackletInRowBlock = CAMath::Max(firstDynamicTracklet, tracker.RowStartHitCountOffset()[CAMath::Max(id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP * HLTCA_GPU_SCHED_ROW_STEP, 1)].z); + if (iTracklet == firstTrackletInRowBlock) + { + const int firstRowInNextBlock = (id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_SCHED_ROW_STEP; + int trackletsInRowBlock; + if (firstRowInNextBlock >= HLTCA_ROW_COUNT - 3) + trackletsInRowBlock = *tracker.NTracklets() - firstTrackletInRowBlock; + else + trackletsInRowBlock = CAMath::Max(firstDynamicTracklet, tracker.RowStartHitCountOffset()[firstRowInNextBlock].z) - firstTrackletInRowBlock; + + tracker.RowBlockPos(0, id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP)->x = trackletsInRowBlock; + tracker.RowBlockPos(0, id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP)->w = trackletsInRowBlock; + } + tracker.RowBlockTracklets(0, id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP)[iTracklet - firstTrackletInRowBlock] = iTracklet; + } +#endif +} + +GPUg() void AliHLTTPCCATrackletConstructorInit(int iSlice) +{ + //GPU Wrapper for AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorInit + AliHLTTPCCATracker &tracker = ( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker )[iSlice]; + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= *tracker.NTracklets()) return; + AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorInit(i, tracker); +} + +GPUg() void AliHLTTPCCATrackletConstructorNewGPU() +{ + //GPU Wrapper for AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorNewGPU + AliHLTTPCCATracker *pTracker = ( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker ); + AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorNewGPU(pTracker); +} + +#else +GPUd() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorNewCPU(AliHLTTPCCATracker &tracker) +{ + //Tracklet constructor simple CPU Function that does not neew a scheduler + GPUshared() AliHLTTPCCASharedMemory sMem; + sMem.fNTracklets = *tracker.NTracklets(); + for (int iTracklet = 0;iTracklet < *tracker.NTracklets();iTracklet++) + { + AliHLTTPCCATrackParam tParam; + AliHLTTPCCAThreadMemory rMem; + + AliHLTTPCCAHitId id = tracker.TrackletStartHits()[iTracklet]; + + rMem.fStartRow = rMem.fEndRow = rMem.fFirstRow = rMem.fLastRow = id.RowIndex(); + rMem.fCurrIH = id.HitIndex(); + rMem.fStage = 0; + rMem.fNHits = 0; + rMem.fNMissed = 0; + + AliHLTTPCCATrackletConstructor::InitTracklet(tParam); + + rMem.fItr = iTracklet; + rMem.fGo = 1; + + for (int j = rMem.fStartRow;j < tracker.Param().NRows();j++) + { + UpdateTracklet(1, 1, 0, iTracklet, sMem, rMem, tracker, tParam, j); + if (!rMem.fGo) break; + } + + rMem.fNMissed = 0; + rMem.fStage = 2; + if ( rMem.fGo ) + { + if ( !tParam.TransportToX( tracker.Row( rMem.fEndRow ).X(), tracker.Param().ConstBz(), .999 ) ) rMem.fGo = 0; + } + + for (int j = rMem.fEndRow;j >= 0;j--) + { + if (!rMem.fGo) break; + UpdateTracklet( 1, 1, 0, iTracklet, sMem, rMem, tracker, tParam, j); + } + + StoreTracklet( 1, 1, 0, iTracklet, sMem, rMem, tracker, tParam ); + } +} +#endif diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletConstructor.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletConstructor.h index 2df55b28620..dfd272c8892 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletConstructor.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletConstructor.h @@ -9,13 +9,22 @@ #ifndef ALIHLTTPCCATRACKLETCONSTRUCTOR_H #define ALIHLTTPCCATRACKLETCONSTRUCTOR_H +#ifdef HLTCA_GPUCODE +#define HLTCA_GPU_USE_INT short +#else +#define HLTCA_GPU_USE_INT int +#endif #include "AliHLTTPCCADef.h" +#include "AliHLTTPCCAGPUConfig.h" +#include "AliHLTTPCCATrackParam.h" /** * @class AliHLTTPCCATrackletConstructor * */ +class AliHLTTPCCATracker; + class AliHLTTPCCATrackletConstructor { public: @@ -26,24 +35,33 @@ class AliHLTTPCCATrackletConstructor public: #if !defined(HLTCA_GPUCODE) AliHLTTPCCASharedMemory() - : fItr0( 0 ), fItr1( 0 ), fNRows( 0 ), fMinStartRow( 0 ), fMaxEndRow( 0 ) {} + : fNextTrackletFirst(0), fNextTrackletCount(0), fNextTrackletNoDummy(0), fNextTrackletStupidDummy(0), fNextTrackletFirstRun(0), fNTracklets(0), fSliceDone(0) {} AliHLTTPCCASharedMemory( const AliHLTTPCCASharedMemory& /*dummy*/ ) - : fItr0( 0 ), fItr1( 0 ), fNRows( 0 ), fMinStartRow( 0 ), fMaxEndRow( 0 ) {} + : fNextTrackletFirst(0), fNextTrackletCount(0), fNextTrackletNoDummy(0), fNextTrackletStupidDummy(0), fNextTrackletFirstRun(0), fNTracklets(0), fSliceDone(0) {} AliHLTTPCCASharedMemory& operator=( const AliHLTTPCCASharedMemory& /*dummy*/ ) { return *this; } #endif -#ifndef CUDA_DEVICE_EMULATION protected: +#ifdef HLTCA_GPU_PREFETCHDATA + uint4 fData[2][ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4]; // temp memory + AliHLTTPCCARow fRow[2]; // row +#else + AliHLTTPCCARow fRows[HLTCA_ROW_COUNT]; +#endif + int fNextTrackletFirst; + int fNextTrackletCount; + int fNextTrackletNoDummy; + int fNextTrackletStupidDummy; + int fNextTrackletFirstRun; + int fNTracklets; + int fSliceDone; + +#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE + int fMaxSync; #endif - uint4 fData[2][( ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM+ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM+ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM )/4]; // temp memory - int fItr0; // start track index - int fItr1; // end track index - int fNRows; // n rows - int fMinStartRow; // min start row - int fMinStartRow32[32]; // min start row for each thread in warp - int fMaxEndRow; // max start row + int fTrackletStoreCount[2][HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1]; }; class AliHLTTPCCAThreadMemory @@ -52,25 +70,21 @@ class AliHLTTPCCATrackletConstructor public: #if !defined(HLTCA_GPUCODE) AliHLTTPCCAThreadMemory() - : fItr( 0 ), fFirstRow( 0 ), fLastRow( 0 ), fStartRow( 0 ), fEndRow( 0 ), fCurrIH( 0 ), fIsMemThread( 0 ), fGo( 0 ), fSave( 0 ), fCurrentData( 0 ), fStage( 0 ), fNHits( 0 ), fNMissed( 0 ), fLastY( 0 ), fLastZ( 0 ) {} + : fItr( 0 ), fFirstRow( 0 ), fLastRow( 0 ), fStartRow( 0 ), fEndRow( 0 ), fCurrIH( 0 ), fGo( 0 ), fCurrentData( 0 ), fStage( 0 ), fNHits( 0 ), fNMissed( 0 ), fLastY( 0 ), fLastZ( 0 ) {} AliHLTTPCCAThreadMemory( const AliHLTTPCCAThreadMemory& /*dummy*/ ) - : fItr( 0 ), fFirstRow( 0 ), fLastRow( 0 ), fStartRow( 0 ), fEndRow( 0 ), fCurrIH( 0 ), fIsMemThread( 0 ), fGo( 0 ), fSave( 0 ), fCurrentData( 0 ), fStage( 0 ), fNHits( 0 ), fNMissed( 0 ), fLastY( 0 ), fLastZ( 0 ) {} + : fItr( 0 ), fFirstRow( 0 ), fLastRow( 0 ), fStartRow( 0 ), fEndRow( 0 ), fCurrIH( 0 ), fGo( 0 ), fCurrentData( 0 ), fStage( 0 ), fNHits( 0 ), fNMissed( 0 ), fLastY( 0 ), fLastZ( 0 ) {} AliHLTTPCCAThreadMemory& operator=( const AliHLTTPCCAThreadMemory& /*dummy*/ ) { return *this; } #endif -#ifndef CUDA_DEVICE_EMULATION protected: -#endif int fItr; // track index int fFirstRow; // first row index int fLastRow; // last row index int fStartRow; // first row index int fEndRow; // first row index int fCurrIH; // indef of the current hit - bool fIsMemThread; // is the thread used for memory taken bool fGo; // do fit/searching flag - bool fSave; // save flag bool fCurrentData; // index of the current memory array int fStage; // reco stage int fNHits; // n track hits @@ -79,21 +93,13 @@ class AliHLTTPCCATrackletConstructor float fLastZ; // Z of the last fitted cluster }; - GPUd() static int NThreadSyncPoints() { return 4 + 159*2 + 1 + 1; } + struct AliHLTTPCCAGPUTempMemory + { + AliHLTTPCCAThreadMemory fThreadMem; + AliHLTTPCCATrackParam fParam; + }; - GPUd() static void Thread( int nBlocks, int nThreads, int iBlock, int iThread, - int iSync, AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, - AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam ); - - GPUd() static void Step0 - ( int nBlocks, int nThreads, int iBlock, int iThread, - AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam ); - GPUd() static void Step1 - ( int nBlocks, int nThreads, int iBlock, int iThread, - AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam ); - GPUd() static void Step2 - ( int nBlocks, int nThreads, int iBlock, int iThread, - AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam ); + GPUd() static void InitTracklet ( AliHLTTPCCATrackParam &tParam ); GPUd() static void ReadData( int iThread, AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, int iRow ); @@ -105,11 +111,20 @@ class AliHLTTPCCATrackletConstructor ( int nBlocks, int nThreads, int iBlock, int iThread, AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam ); +#ifdef HLTCA_GPUCODE + GPUd() static void AliHLTTPCCATrackletConstructorNewGPU(AliHLTTPCCATracker *pTracker); + GPUd() static int FetchTracklet(AliHLTTPCCATracker &tracker, AliHLTTPCCASharedMemory &sMem, int Reverse, int RowBlock, int &mustInit); + GPUd() static void AliHLTTPCCATrackletConstructorInit(int iTracklet, AliHLTTPCCATracker &tracke); + GPUd() static void CopyTrackletTempData( AliHLTTPCCAThreadMemory &rMemSrc, AliHLTTPCCAThreadMemory &rMemDst, AliHLTTPCCATrackParam &tParamSrc, AliHLTTPCCATrackParam &tParamDst); +#else + GPUd() static void AliHLTTPCCATrackletConstructorNewCPU(AliHLTTPCCATracker &tracker); +#endif + GPUd() static bool SAVE() { return 1; } #if defined(HLTCA_GPUCODE) //GPUhd() inline int NMemThreads() { return 128; } -#define TRACKLET_CONSTRUCTOR_NMEMTHREDS 128 +#define TRACKLET_CONSTRUCTOR_NMEMTHREDS HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS #else //GPUhd() inline int NMemThreads() { return 1; } #define TRACKLET_CONSTRUCTOR_NMEMTHREDS 1 @@ -118,4 +133,5 @@ class AliHLTTPCCATrackletConstructor }; + #endif diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletSelector.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletSelector.cxx index 72aa51e45a0..16608564d4d 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletSelector.cxx +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletSelector.cxx @@ -33,24 +33,31 @@ GPUd() void AliHLTTPCCATrackletSelector::Thread if ( iSync == 0 ) { if ( iThread == 0 ) { - if ( iBlock == 0 ) { - CAMath::AtomicExch( tracker.NTracks(), 0 ); - CAMath::AtomicExch( tracker.NTrackHits(), 0 ); - } s.fNTracklets = *tracker.NTracklets(); s.fNThreadsTotal = nThreads * nBlocks; s.fItr0 = nThreads * iBlock; } } else if ( iSync == 1 ) { - AliHLTTPCCATrack tout; - AliHLTTPCCAHitId trackHits[160]; + int nHits, nFirstTrackHit; + AliHLTTPCCAHitId trackHits[160 - HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE]; for ( int itr = s.fItr0 + iThread; itr < s.fNTracklets; itr += s.fNThreadsTotal ) { - AliHLTTPCCATracklet &tracklet = tracker.Tracklets()[itr]; +#ifdef HLTCA_GPU_EMULATION_DEBUG_TRACKLET + if (itr == HLTCA_GPU_EMULATION_DEBUG_TRACKLET) + { + tracker.GPUParameters()->fGPUError = 1; + } +#endif + + while (tracker.Tracklets()[itr].NHits() == 0) + { + itr += s.fNThreadsTotal; + if (itr >= s.fNTracklets) return; + } - int tNHits = tracklet.NHits(); - if ( tNHits <= 0 ) continue; + AliHLTTPCCATracklet &tracklet = tracker.Tracklets()[itr]; + const int tNHits = tracklet.NHits(); const int kMaxRowGap = 4; const float kMaxShared = .1; @@ -58,7 +65,6 @@ GPUd() void AliHLTTPCCATrackletSelector::Thread int firstRow = tracklet.FirstRow(); int lastRow = tracklet.LastRow(); - tout.SetNHits( 0 ); int kind = 0; if ( 0 ) { if ( tNHits >= 10 && 1. / .5 >= CAMath::Abs( tracklet.Param().QPt() ) ) { //SG!!! @@ -70,40 +76,70 @@ GPUd() void AliHLTTPCCATrackletSelector::Thread //int w = (tNHits<<16)+itr; //int nRows = tracker.Param().NRows(); + //std::cout<<" store tracklet: "<= TRACKLET_SELECTOR_MIN_HITS; irow++ ) { gap++; - int ih = tracklet.RowHit( irow ); +#ifdef EXTERN_ROW_HITS + int ih = tracker.TrackletRowHits()[irow * s.fNTracklets + itr]; +#else + int ih = tracklet.RowHit( irow ); +#endif if ( ih >= 0 ) { const AliHLTTPCCARow &row = tracker.Row( irow ); bool own = ( tracker.HitWeight( row, ih ) <= w ); - bool sharedOK = ( ( tout.NHits() < 0 ) || ( nShared < tout.NHits() * kMaxShared ) ); + bool sharedOK = ( ( nShared < nHits * kMaxShared ) ); if ( own || sharedOK ) {//SG!!! gap = 0; - trackHits[tout.NHits()].Set( irow, ih ); - tout.SetNHits( tout.NHits() + 1 ); +#if HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE != 0 + if (nHits < HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE) + s.fHits[iThread][nHits].Set( irow, ih ); + else +#endif + trackHits[nHits - HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE].Set( irow, ih ); + nHits++; if ( !own ) nShared++; } } if ( gap > kMaxRowGap || irow == lastRow ) { // store - if ( tout.NHits() >= 10 ) { //SG!!! + if ( nHits >= TRACKLET_SELECTOR_MIN_HITS ) { //SG!!! int itrout = CAMath::AtomicAdd( tracker.NTracks(), 1 ); - tout.SetFirstHitID( CAMath::AtomicAdd( tracker.NTrackHits(), tout.NHits() ) ); - tout.SetParam( tracklet.Param() ); - tout.SetAlive( 1 ); - tracker.Tracks()[itrout] = tout; - for ( int jh = 0; jh < tout.NHits(); jh++ ) { - tracker.TrackHits()[tout.FirstHitID() + jh] = trackHits[jh]; +#ifdef HLTCA_GPUCODE + if (itrout >= HLTCA_GPU_MAX_TRACKS) + { + tracker.GPUParameters()->fGPUError = HLTCA_GPU_ERROR_TRACK_OVERFLOW; + CAMath::AtomicExch( tracker.NTracks(), 0 ); + return; + } +#endif + nFirstTrackHit = CAMath::AtomicAdd( tracker.NTrackHits(), nHits ); + tracker.Tracks()[itrout].SetAlive(1); + tracker.Tracks()[itrout].SetParam(tracklet.Param()); + tracker.Tracks()[itrout].SetFirstHitID(nFirstTrackHit); + tracker.Tracks()[itrout].SetNHits(nHits); + for ( int jh = 0; jh < nHits; jh++ ) { +#if HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE != 0 + if (jh < HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE) + tracker.TrackHits()[nFirstTrackHit + jh] = s.fHits[iThread][jh]; + else +#endif + tracker.TrackHits()[nFirstTrackHit + jh] = trackHits[jh - HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE]; } } - tout.SetNHits( 0 ); + nHits = 0; gap = 0; nShared = 0; } } + + } } } diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletSelector.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletSelector.h index 7fb238bf98a..a08f452e57b 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletSelector.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletSelector.h @@ -11,6 +11,8 @@ #include "AliHLTTPCCADef.h" +#include "AliHLTTPCCAHitId.h" +#include "AliHLTTPCCAGPUConfig.h" class AliHLTTPCCATracker; /** @@ -23,15 +25,15 @@ class AliHLTTPCCATrackletSelector class AliHLTTPCCASharedMemory { friend class AliHLTTPCCATrackletSelector; -#ifndef CUDA_DEVICE_EMULATION - protected: -#else - public: -#endif + + protected: int fItr0; // index of the first track in the block int fNThreadsTotal; // total n threads int fNTracklets; // n of tracklets - }; +#if HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE != 0 + AliHLTTPCCAHitId fHits[HLTCA_GPU_THREAD_COUNT][HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE]; +#endif + }; GPUd() static int NThreadSyncPoints() { return 1; } diff --git a/HLT/TPCLib/tracking-ca/AliTPCtrackerCA.cxx b/HLT/TPCLib/tracking-ca/AliTPCtrackerCA.cxx index f4ff3cf0bb5..7c7c4121437 100644 --- a/HLT/TPCLib/tracking-ca/AliTPCtrackerCA.cxx +++ b/HLT/TPCLib/tracking-ca/AliTPCtrackerCA.cxx @@ -144,7 +144,8 @@ AliTPCtrackerCA::AliTPCtrackerCA( const AliTPCParam *par ): } } } - hlt.SliceTracker( iSlice ).Initialize( param ); + //hlt.SliceTracker( iSlice ).Initialize( param ); + hlt.InitializeSliceParam(iSlice, param); } } @@ -408,15 +409,16 @@ int AliTPCtrackerCA::Clusters2Tracks( AliESDEvent *event ) tTPC.SetClusterPointer( row, c ); AliTPCTrackerPoint &point = *( tTPC.GetTrackPoint( row ) ); { - AliHLTTPCCATracker &slice = hlt.SliceTracker( iSlice ); - if ( slice.Param().Alpha() != alpha ) { - if ( ! t0.Rotate( slice.Param().Alpha() - alpha, .999 ) ) continue; - alpha = slice.Param().Alpha(); + //AliHLTTPCCATracker &slice = hlt.SliceTracker( iSlice ); + if ( hlt.Param(iSlice).Alpha() != alpha ) { + if ( ! t0.Rotate( hlt.Param(iSlice).Alpha() - alpha, .999 ) ) continue; + alpha = hlt.Param(iSlice).Alpha(); } - float x = slice.Row( row ).X(); - if ( !t0.TransportToX( x, slice.Param().GetBz( t0 ), .999 ) ) continue; + float x = hlt.Row(iSlice, row).X(); + if ( !t0.TransportToX( x, hlt.Param(iSlice).GetBz( t0 ), .999 ) ) continue; float sy2, sz2; - slice.GetErrors2( row, t0, sy2, sz2 ); + //slice.GetErrors2( row, t0, sy2, sz2 ); + hlt.Param(iSlice).GetClusterErrors2( row, t0.GetZ(), t0.SinPhi(), t0.GetCosPhi(), t0.DzDs(), sy2, sz2 ); point.SetSigmaY( c->GetSigmaY2() / sy2 ); point.SetSigmaZ( c->GetSigmaZ2() / sz2 ); point.SetAngleY( TMath::Abs( t0.GetSinPhi() / t0.GetCosPhi() ) ); diff --git a/HLT/TPCLib/tracking-ca/MemoryAssignmentHelpers.h b/HLT/TPCLib/tracking-ca/MemoryAssignmentHelpers.h index 8cf3e4191f1..2a48f76105a 100644 --- a/HLT/TPCLib/tracking-ca/MemoryAssignmentHelpers.h +++ b/HLT/TPCLib/tracking-ca/MemoryAssignmentHelpers.h @@ -33,7 +33,7 @@ GPUhd() static inline void AlignTo( char *&mem ) } template -static inline unsigned int NextMultipleOf( unsigned int value ) +GPUhd() static inline unsigned int NextMultipleOf( unsigned int value ) { STATIC_ASSERT( ( X & ( X - 1 ) ) == 0, X_needs_to_be_a_multiple_of_2 ); const int offset = value & ( X - 1 ); -- 2.39.3