From d3821846f1dbc79c857a5a526d884ca5e3daedb2 Mon Sep 17 00:00:00 2001 From: drohr Date: Sun, 27 Oct 2013 11:35:32 +0000 Subject: [PATCH] Update NVIDIA GPU Tracking library to be compatible to AliRoot patch 64473, add preliminary version of OpenCL based GPU Tracking library, add #ifdefs to tracker code to exclude AliRoot code from the OpenCL source code which breaks the OpenCL build --- HLT/TPCLib/tracking-ca/AliHLTTPCCADef.h | 2 +- HLT/TPCLib/tracking-ca/AliHLTTPCCAMath.h | 2 +- HLT/TPCLib/tracking-ca/AliHLTTPCCASliceData.h | 2 + HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.cxx | 2 +- HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.h | 2 + .../AliHLTTPCCATrackerFramework.cxx | 4 +- .../cagpu/AliHLTTPCCAGPUTrackerBase.cxx | 1030 ++++++++++++ .../cagpu/AliHLTTPCCAGPUTrackerBase.h | 207 +++ .../cagpu/AliHLTTPCCAGPUTrackerCommon.h | 28 + .../cagpu/AliHLTTPCCAGPUTrackerNVCC.cu | 1394 +++-------------- ...PUTrackerNVCC.cu.x86_64-pc-linux-gnu.patch | 122 -- .../cagpu/AliHLTTPCCAGPUTrackerNVCC.h | 154 +- .../cagpu/AliHLTTPCCAGPUTrackerOpenCL.cl | 113 ++ .../cagpu/AliHLTTPCCAGPUTrackerOpenCL.cxx | 810 ++++++++++ .../cagpu/AliHLTTPCCAGPUTrackerOpenCL.h | 65 + .../AliHLTTPCCAGPUTrackerOpenCLInternals.h | 40 + .../cagpu/AliHLTTPCCATrackletConstructorGPU.h | 269 ++-- HLT/TPCLib/tracking-ca/cagpu/makefile | 61 +- .../tracking-ca/cagpu/makefiles/include.S | 8 + .../makefiles/makefile_opencl_compiler.cpp | 232 +++ .../cagpu/makefiles/opencl_compiler_structs.h | 16 + .../cagpu/makefiles/opencl_obtain_program.h | 86 + 22 files changed, 3019 insertions(+), 1630 deletions(-) create mode 100644 HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerBase.cxx create mode 100644 HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerBase.h create mode 100644 HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerCommon.h delete mode 100755 HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu.x86_64-pc-linux-gnu.patch create mode 100644 HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.cl create mode 100644 HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.cxx create mode 100644 HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.h create mode 100644 HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCLInternals.h create mode 100644 HLT/TPCLib/tracking-ca/cagpu/makefiles/include.S create mode 100644 HLT/TPCLib/tracking-ca/cagpu/makefiles/makefile_opencl_compiler.cpp create mode 100644 HLT/TPCLib/tracking-ca/cagpu/makefiles/opencl_compiler_structs.h create mode 100644 HLT/TPCLib/tracking-ca/cagpu/makefiles/opencl_obtain_program.h diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCADef.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCADef.h index 65ff3f45bd3..9e34bc38e3c 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCADef.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCADef.h @@ -42,7 +42,7 @@ #endif //VSNET_RUNTIME #endif //R__WIN32 -#ifdef HLTCA_STANDALONE +#if defined(HLTCA_STANDALONE) || (defined(HLTCA_GPUCODE) && defined(__OPENCL__) && !defined(HLTCA_HOSTCODE)) // class TObject{}; diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAMath.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCAMath.h index fa5cfccacff..e24b439ffaf 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCAMath.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCAMath.h @@ -72,7 +72,7 @@ typedef AliHLTTPCCAMath CAMath; #if defined( HLTCA_STANDALONE ) #define choiceA(c1,c2,c3) c2 #else //HLTCA_STANDALONE -#define choiceA(c1,c2,c3) c3 +#define choiceA(c1,c2,c3) c2 #endif //HLTCA_STANDALONE #else //HLTCA_HOSTCODE #define choiceA(c1, c2, c3) c2 diff --git 
a/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceData.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceData.h index 14914cd1164..04db579bf86 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceData.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCASliceData.h @@ -20,7 +20,9 @@ #include "AliHLTTPCCADef.h" #include "AliHLTTPCCARow.h" #include "AliHLTTPCCAMath.h" +#if !(defined(HLTCA_GPUCODE) && defined(__OPENCL__) && !defined(HLTCA_HOSTCODE)) #include "AliHLTArray.h" +#endif #include "AliHLTTPCCAGPUConfig.h" typedef int int_v; diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.cxx index 387d76f8f25..bae170b2c39 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.cxx +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.cxx @@ -24,7 +24,6 @@ #include "AliHLTTPCCAMath.h" #include "MemoryAssignmentHelpers.h" -#include "TStopwatch.h" #include "AliHLTTPCCAHitArea.h" #include "AliHLTTPCCANeighboursFinder.h" #include "AliHLTTPCCANeighboursCleaner.h" @@ -39,6 +38,7 @@ #include "AliHLTTPCCAGPUConfig.h" #if !defined(HLTCA_GPUCODE) +#include "TStopwatch.h" #include #include #include diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.h index 3f353d3162c..7abf200b011 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.h +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.h @@ -30,7 +30,9 @@ MEM_CLASS_PRE() class AliHLTTPCCATrackParam; class AliHLTTPCCAClusterData; MEM_CLASS_PRE() class AliHLTTPCCARow; +#if !(defined(HLTCA_GPUCODE) && defined(__OPENCL__) && !defined(HLTCA_HOSTCODE)) #include "TStopwatch.h" +#endif /** * @class AliHLTTPCCATracker diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.cxx index 2d82fe33ae4..bcfbd7da7d0 100644 --- a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.cxx +++ b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.cxx @@ -229,9 +229,9 @@ AliHLTTPCCATrackerFramework::AliHLTTPCCATrackerFramework(int allowGPU, const cha if (allowGPU) { #ifndef R__WIN32 - HLTInfo("The following error occured during dlopen: %s", dlerror()); + HLTImportant("The following error occured during dlopen: %s", dlerror()); #endif - HLTError("Error Opening cagpu library for GPU Tracker, will fallback to CPU"); + HLTError("Error Opening cagpu library for GPU Tracker (%s), will fallback to CPU", GPU_Library == NULL ? "default: " GPULIBNAME : GPU_Library); } else { diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerBase.cxx b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerBase.cxx new file mode 100644 index 00000000000..06a086186a6 --- /dev/null +++ b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerBase.cxx @@ -0,0 +1,1030 @@ +// ************************************************************************** +// This file is property of and copyright by the ALICE HLT Project * +// ALICE Experiment at CERN, All rights reserved. * +// * +// Primary Authors: Sergey Gorbunov * +// Ivan Kisel * +// David Rohr * +// for The ALICE HLT Project. * +// * +// Permission to use, copy, modify and distribute this software and its * +// documentation strictly for non-commercial purposes is hereby granted * +// without fee, provided that the above copyright notice appears in all * +// copies and that both the copyright notice and this permission notice * +// appear in the supporting documentation. The authors make no claims * +// about the suitability of this software for any purpose. It is * +// provided "as is" without express or implied warranty. 
* +// * +//*************************************************************************** + +#include +#ifndef _WIN32 +#include +#endif +#include "AliHLTTPCCAGPUTrackerBase.h" +#include "AliHLTTPCCAClusterData.h" +#include "AliHLTTPCCAGPUTrackerCommon.h" + +ClassImp( AliHLTTPCCAGPUTrackerBase ) + +int AliHLTTPCCAGPUTrackerBase::GlobalTracking(int iSlice, int threadId, AliHLTTPCCAGPUTrackerBase::helperParam* hParam) +{ + if (fDebugLevel >= 3) {HLTDebug("GPU Tracker running Global Tracking for slice %d on thread %d\n", iSlice, threadId);} + + int sliceLeft = (iSlice + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2); + int sliceRight = (iSlice + 1) % (fgkNSlices / 2); + if (iSlice >= fgkNSlices / 2) + { + sliceLeft += fgkNSlices / 2; + sliceRight += fgkNSlices / 2; + } + while (fSliceOutputReady < iSlice || fSliceOutputReady < sliceLeft || fSliceOutputReady < sliceRight) + { + if (hParam != NULL && hParam->fReset) return(1); + } + + pthread_mutex_lock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceLeft]); + pthread_mutex_lock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceRight]); + fSlaveTrackers[iSlice].PerformGlobalTracking(fSlaveTrackers[sliceLeft], fSlaveTrackers[sliceRight], HLTCA_GPU_MAX_TRACKS); + pthread_mutex_unlock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceLeft]); + pthread_mutex_unlock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceRight]); + + fSliceLeftGlobalReady[sliceLeft] = 1; + fSliceRightGlobalReady[sliceRight] = 1; + if (fDebugLevel >= 3) {HLTDebug("GPU Tracker finished Global Tracking for slice %d on thread %d\n", iSlice, threadId);} + return(0); +} + +void* AliHLTTPCCAGPUTrackerBase::helperWrapper(void* arg) +{ + AliHLTTPCCAGPUTrackerBase::helperParam* par = (AliHLTTPCCAGPUTrackerBase::helperParam*) arg; + AliHLTTPCCAGPUTrackerBase* cls = par->fCls; + + AliHLTTPCCATracker* tmpTracker = new AliHLTTPCCATracker; + +#ifdef HLTCA_STANDALONE + if (cls->fDebugLevel >= 2) HLTInfo("\tHelper thread %d starting", par->fNum); +#endif + +#if defined(HLTCA_STANDALONE) & !defined(_WIN32) + cpu_set_t mask; + CPU_ZERO(&mask); + CPU_SET(par->fNum * 2 + 2, &mask); + //sched_setaffinity(0, sizeof(mask), &mask); +#endif + + while(pthread_mutex_lock(&((pthread_mutex_t*) par->fMutex)[0]) == 0 && par->fTerminate == false) + { + if (par->CPUTracker) + { + for (int i = 0;i < cls->fNSlicesPerCPUTracker;i++) + { + int myISlice = cls->fSliceCount - cls->fNCPUTrackers * cls->fNSlicesPerCPUTracker + (par->fNum - cls->fNHelperThreads) * cls->fNSlicesPerCPUTracker + i; +#ifdef HLTCA_STANDALONE + if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Doing full CPU tracking, Slice %d", par->fNum, myISlice); +#endif + if (myISlice >= 0) + { + tmpTracker->Initialize(cls->fSlaveTrackers[par->fFirstSlice + myISlice].Param()); + tmpTracker->ReadEvent(&par->pClusterData[myISlice]); + tmpTracker->DoTracking(); + tmpTracker->SetOutput(&par->pOutput[myISlice]); + pthread_mutex_lock((pthread_mutex_t*) cls->fHelperMemMutex); + tmpTracker->WriteOutputPrepare(); + pthread_mutex_unlock((pthread_mutex_t*) cls->fHelperMemMutex); + tmpTracker->WriteOutput(); + + /*cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUSliceDataMemory((char*) new uint4[HLTCA_GPU_SLICE_DATA_MEMORY/sizeof(uint4)], (char*) new uint4[HLTCA_GPU_ROWS_MEMORY/sizeof(uint4)]); + cls->fSlaveTrackers[par->fFirstSlice + myISlice].ReadEvent(&par->pClusterData[myISlice]); + cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersTracklets(HLTCA_GPU_MAX_TRACKLETS); + cls->fSlaveTrackers[par->fFirstSlice + 
myISlice].SetPointersHits(par->pClusterData[myISlice].NumberOfClusters()); + cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersTracks(HLTCA_GPU_MAX_TRACKS, par->pClusterData[myISlice].NumberOfClusters()); + cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerTrackletsMemory(reinterpret_cast ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS, cls->fConstructorBlockCount); + cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerHitsMemory(reinterpret_cast ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].HitMemorySize()/sizeof( uint4 ) + 100]), par->pClusterData[myISlice].NumberOfClusters()); + cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerTracksMemory(reinterpret_cast ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackMemorySize()/sizeof( uint4 ) + 100]), HLTCA_GPU_MAX_TRACKS, par->pClusterData[myISlice].NumberOfClusters()); + cls->fSlaveTrackers[par->fFirstSlice + myISlice].DoTracking(); + cls->WriteOutput(par->pOutput, par->fFirstSlice, myISlice, par->fNum + 1); + delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].HitMemory(); + delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackletMemory(); + delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackMemory();*/ + } +#ifdef HLTCA_STANDALONE + if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Finished, Slice %d", par->fNum, myISlice); +#endif + } + } + else + { + int mustRunSlice19 = 0; + for (int i = par->fNum + 1;i < par->fSliceCount;i += cls->fNHelperThreads + 1) + { + //if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Running, Slice %d+%d, Phase %d", par->fNum, par->fFirstSlice, i, par->fPhase); + if (par->fPhase) + { + if (cls->fUseGlobalTracking) + { + int realSlice = i + 1; + if (realSlice % (fgkNSlices / 2) < 1) realSlice -= fgkNSlices / 2; + + if (realSlice % (fgkNSlices / 2) != 1) + { + cls->GlobalTracking(realSlice, par->fNum + 1, par); + } + + if (realSlice == 19) + { + mustRunSlice19 = 1; + } + else + { + while (cls->fSliceLeftGlobalReady[realSlice] == 0 || cls->fSliceRightGlobalReady[realSlice] == 0) + { + if (par->fReset) goto ResetHelperThread; + } + cls->WriteOutput(par->pOutput, par->fFirstSlice, realSlice, par->fNum + 1); + } + } + else + { + while (cls->fSliceOutputReady < i) + { + if (par->fReset) goto ResetHelperThread; + } + cls->WriteOutput(par->pOutput, par->fFirstSlice, i, par->fNum + 1); + } + } + else + { + cls->ReadEvent(par->pClusterData, par->fFirstSlice, i, par->fNum + 1); + par->fDone = i + 1; + } + //if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Finished, Slice %d+%d, Phase %d", par->fNum, par->fFirstSlice, i, par->fPhase); + } + if (mustRunSlice19) + { + while (cls->fSliceLeftGlobalReady[19] == 0 || cls->fSliceRightGlobalReady[19] == 0) + { + if (par->fReset) goto ResetHelperThread; + } + cls->WriteOutput(par->pOutput, par->fFirstSlice, 19, par->fNum + 1); + } + } +ResetHelperThread: + cls->ResetThisHelperThread(par); + } +#ifdef HLTCA_STANDALONE + if (cls->fDebugLevel >= 2) HLTInfo("\tHelper thread %d terminating", par->fNum); +#endif + delete tmpTracker; + pthread_mutex_unlock(&((pthread_mutex_t*) par->fMutex)[1]); + pthread_exit(NULL); + return(NULL); +} + +void AliHLTTPCCAGPUTrackerBase::ResetThisHelperThread(AliHLTTPCCAGPUTrackerBase::helperParam* par) +{ + if (par->fReset) HLTImportant("GPU Helper Thread %d reseting", par->fNum); + par->fReset = false; + pthread_mutex_unlock(&((pthread_mutex_t*) 
par->fMutex)[1]); +} + +#define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem" + +AliHLTTPCCAGPUTrackerBase::AliHLTTPCCAGPUTrackerBase() : +fGpuTracker(NULL), +fGPUMemory(NULL), +fHostLockedMemory(NULL), +fGPUMergerMemory(NULL), +fGPUMergerHostMemory(NULL), +fGPUMergerMaxMemory(0), +fDebugLevel(0), +fDebugMask(0xFFFFFFFF), +fOutFile(NULL), +fGPUMemSize(0), +fSliceCount(HLTCA_GPU_DEFAULT_MAX_SLICE_COUNT), +fCudaDevice(0), +fOutputControl(NULL), +fThreadId(0), +fCudaInitialized(0), +fPPMode(0), +fSelfheal(0), +fConstructorBlockCount(30), +selectorBlockCount(30), +fNHelperThreads(HLTCA_GPU_DEFAULT_HELPER_THREADS), +fHelperParams(NULL), +fHelperMemMutex(NULL), +fSliceOutputReady(0), +fSliceGlobalMutexes(NULL), +fNCPUTrackers(0), +fNSlicesPerCPUTracker(0), +fGlobalTracking(0), +fUseGlobalTracking(0), +fNSlaveThreads(0) +{} + +AliHLTTPCCAGPUTrackerBase::~AliHLTTPCCAGPUTrackerBase() +{ +} + +void AliHLTTPCCAGPUTrackerBase::ReleaseGlobalLock(void* sem) +{ + //Release the global named semaphore that locks GPU Initialization +#ifdef R__WIN32 + HANDLE* h = (HANDLE*) sem; + ReleaseSemaphore(*h, 1, NULL); + CloseHandle(*h); + delete h; +#else + sem_t* pSem = (sem_t*) sem; + sem_post(pSem); + sem_unlink(SemLockName); +#endif +} + +int AliHLTTPCCAGPUTrackerBase::CheckMemorySizes(int sliceCount) +{ + //Check constants for correct memory sizes + if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY) + { + HLTError("Insufficiant Tracker Object Memory for %d slices", sliceCount); + return(1); + } + + if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY) + { + HLTError("Insufficiant Common Memory"); + return(1); + } + + if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY) + { + HLTError("Insufficiant Row Memory"); + return(1); + } + + if (fDebugLevel >= 3) + { + HLTInfo("Memory usage: Tracker Object %d / %d, Common Memory %d / %d, Row Memory %d / %d", (int) sizeof(AliHLTTPCCATracker) * sliceCount, HLTCA_GPU_TRACKER_OBJECT_MEMORY, (int) (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize()), HLTCA_GPU_COMMON_MEMORY, (int) (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow)), HLTCA_GPU_ROWS_MEMORY); + } + return(0); +} + +void AliHLTTPCCAGPUTrackerBase::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile) +{ + //Set Debug Level and Debug output File if applicable + fDebugLevel = dwLevel; + if (NewOutFile) fOutFile = NewOutFile; +} + +int AliHLTTPCCAGPUTrackerBase::SetGPUTrackerOption(char* OptionName, int OptionValue) +{ + //Set a specific GPU Tracker Option + if (strcmp(OptionName, "PPMode") == 0) + { + fPPMode = OptionValue; + } + else if (strcmp(OptionName, "DebugMask") == 0) + { + fDebugMask = OptionValue; + } + else if (strcmp(OptionName, "HelperThreads") == 0) + { + fNHelperThreads = OptionValue; + } + else if (strcmp(OptionName, "CPUTrackers") == 0) + { + fNCPUTrackers = OptionValue; + } + else if (strcmp(OptionName, "SlicesPerCPUTracker") == 0) + { + fNSlicesPerCPUTracker = OptionValue; + } + else if (strcmp(OptionName, "GlobalTracking") == 0) + { + fGlobalTracking = OptionValue; + } + else + { + HLTError("Unknown Option: %s", OptionName); + return(1); + } + + if (fNHelperThreads + fNCPUTrackers > fNSlaveThreads && fCudaInitialized) + { + HLTInfo("Insufficient Slave Threads available (%d), creating additional Slave Threads (%d+%d)\n", fNSlaveThreads, fNHelperThreads, fNCPUTrackers); + StopHelperThreads(); + StartHelperThreads(); + } + + return(0); +} + +#ifdef HLTCA_STANDALONE +void 
AliHLTTPCCAGPUTrackerBase::StandalonePerfTime(int iSlice, int i) +{ + //Run Performance Query for timer i of slice iSlice + if (fDebugLevel >= 1) + { + AliHLTTPCCATracker::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i)); + } +} +#else +void AliHLTTPCCAGPUTrackerBase::StandalonePerfTime(int /*iSlice*/, int /*i*/) {} +#endif + +int AliHLTTPCCAGPUTrackerBase::SelfHealReconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal) +{ + if (!fSelfheal) + { + ReleaseThreadContext(); + return(1); + } + static bool selfHealing = false; + if (selfHealing) + { + HLTError("Selfhealing failed, giving up"); + ReleaseThreadContext(); + return(1); + } + else + { + HLTError("Unsolvable CUDA error occured, trying to reinitialize GPU"); + } + selfHealing = true; + ExitGPU(); + if (InitGPU(fSliceCount, fCudaDevice)) + { + HLTError("Could not reinitialize CUDA device, disabling GPU tracker"); + ExitGPU(); + return(1); + } + HLTInfo("GPU tracker successfully reinitialized, restarting tracking"); + int retVal = Reconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal); + selfHealing = false; + return(retVal); +} + +void AliHLTTPCCAGPUTrackerBase::ReadEvent(AliHLTTPCCAClusterData* pClusterData, int firstSlice, int iSlice, int threadId) +{ + fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice)); +#ifdef HLTCA_GPU_TIME_PROFILE + unsigned long long int a, b; + AliHLTTPCCATracker::StandaloneQueryTime(&a); +#endif + fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]); +#ifdef HLTCA_GPU_TIME_PROFILE + AliHLTTPCCATracker::StandaloneQueryTime(&b); + HLTInfo("Read %d %f %f\n", threadId, ((double) b - (double) a) / (double) fProfTimeC, ((double) a - (double) fProfTimeD) / (double) fProfTimeC); +#endif +} + +void AliHLTTPCCAGPUTrackerBase::WriteOutput(AliHLTTPCCASliceOutput** pOutput, int firstSlice, int iSlice, int threadId) +{ + if (fDebugLevel >= 3) {HLTDebug("GPU Tracker running WriteOutput for slice %d on thread %d\n", firstSlice + iSlice, threadId);} + fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]); +#ifdef HLTCA_GPU_TIME_PROFILE + unsigned long long int a, b; + AliHLTTPCCATracker::StandaloneQueryTime(&a); +#endif + if (fNHelperThreads) pthread_mutex_lock((pthread_mutex_t*) fHelperMemMutex); + fSlaveTrackers[firstSlice + iSlice].WriteOutputPrepare(); + if (fNHelperThreads) pthread_mutex_unlock((pthread_mutex_t*) fHelperMemMutex); + fSlaveTrackers[firstSlice + iSlice].WriteOutput(); +#ifdef HLTCA_GPU_TIME_PROFILE + AliHLTTPCCATracker::StandaloneQueryTime(&b); + HLTInfo("Write %d %f %f\n", threadId, ((double) b - (double) a) / (double) fProfTimeC, ((double) a - (double) fProfTimeD) / (double) fProfTimeC); +#endif + if (fDebugLevel >= 3) {HLTDebug("GPU Tracker finished WriteOutput for slice %d on thread %d\n", firstSlice + iSlice, threadId);} +} + +int AliHLTTPCCAGPUTrackerBase::InitializeSliceParam(int iSlice, AliHLTTPCCAParam ¶m) +{ + //Initialize Slice Tracker Parameter for a slave tracker + fSlaveTrackers[iSlice].Initialize(param); + if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT) + { + HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT); + return(1); + } + return(0); +} + +void AliHLTTPCCAGPUTrackerBase::ResetHelperThreads(int helpers) +{ + HLTImportant("Error occurred, GPU tracker helper threads will be reset 
(Number of threads %d/%d)", fNHelperThreads, fNCPUTrackers); + SynchronizeGPU(); + ReleaseThreadContext(); + for (int i = 0;i < fNHelperThreads + fNCPUTrackers;i++) + { + fHelperParams[i].fReset = true; + if (helpers || i >= fNHelperThreads) pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]); + } + HLTImportant("GPU Tracker helper threads have ben reset"); +} + +int AliHLTTPCCAGPUTrackerBase::StartHelperThreads() +{ + int nThreads = fNHelperThreads + fNCPUTrackers; + if (nThreads) + { + fHelperParams = new helperParam[nThreads]; + if (fHelperParams == NULL) + { + HLTError("Memory allocation error"); + ExitGPU(); + return(1); + } + for (int i = 0;i < nThreads;i++) + { + fHelperParams[i].fCls = this; + fHelperParams[i].fTerminate = false; + fHelperParams[i].fReset = false; + fHelperParams[i].fNum = i; + fHelperParams[i].fMutex = malloc(2 * sizeof(pthread_mutex_t)); + if (fHelperParams[i].fMutex == NULL) + { + HLTError("Memory allocation error"); + ExitGPU(); + return(1); + } + for (int j = 0;j < 2;j++) + { + if (pthread_mutex_init(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j], NULL)) + { + HLTError("Error creating pthread mutex"); + ExitGPU(); + return(1); + } + + pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]); + } + fHelperParams[i].fThreadId = (void*) malloc(sizeof(pthread_t)); + + if (pthread_create((pthread_t*) fHelperParams[i].fThreadId, NULL, helperWrapper, &fHelperParams[i])) + { + HLTError("Error starting slave thread"); + ExitGPU(); + return(1); + } + } + } + fNSlaveThreads = nThreads; + return(0); +} + +int AliHLTTPCCAGPUTrackerBase::StopHelperThreads() +{ + if (fNSlaveThreads) + { + for (int i = 0;i < fNSlaveThreads;i++) + { + fHelperParams[i].fTerminate = true; + if (pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0])) + { + HLTError("Error unlocking mutex to terminate slave"); + return(1); + } + if (pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1])) + { + HLTError("Error locking mutex"); + return(1); + } + if (pthread_join( *((pthread_t*) fHelperParams[i].fThreadId), NULL)) + { + HLTError("Error waiting for thread to terminate"); + return(1); + } + free(fHelperParams[i].fThreadId); + for (int j = 0;j < 2;j++) + { + if (pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j])) + { + HLTError("Error unlocking mutex before destroying"); + return(1); + } + pthread_mutex_destroy(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]); + } + free(fHelperParams[i].fMutex); + } + delete[] fHelperParams; + } + fNSlaveThreads = 0; + return(0); +} + +void AliHLTTPCCAGPUTrackerBase::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val) +{ + //Set Output Control Pointers + fOutputControl = val; + for (int i = 0;i < fgkNSlices;i++) + { + fSlaveTrackers[i].SetOutputControl(val); + } +} + +int AliHLTTPCCAGPUTrackerBase::GetThread() +{ + //Get Thread ID +#ifdef R__WIN32 + return((int) (size_t) GetCurrentThread()); +#else + return((int) syscall (SYS_gettid)); +#endif +} + +unsigned long long int* AliHLTTPCCAGPUTrackerBase::PerfTimer(int iSlice, unsigned int i) +{ + //Returns pointer to PerfTimer i of slice iSlice + return(fSlaveTrackers ? 
fSlaveTrackers[iSlice].PerfTimer(i) : NULL); +} + +const AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerBase::OutputControl() const +{ + //Return Pointer to Output Control Structure + return fOutputControl; +} + +int AliHLTTPCCAGPUTrackerBase::GetSliceCount() const +{ + //Return max slice count processable + return(fSliceCount); +} + +char* AliHLTTPCCAGPUTrackerBase::MergerBaseMemory() +{ + return(alignPointer((char*) fGPUMergerHostMemory, 1024 * 1024)); +} + +int AliHLTTPCCAGPUTrackerBase::IsInitialized() +{ + return(fCudaInitialized); +} + +int AliHLTTPCCAGPUTrackerBase::InitGPU(int sliceCount, int forceDeviceID) +{ +#if defined(HLTCA_STANDALONE) & !defined(_WIN32) + cpu_set_t mask; + CPU_ZERO(&mask); + CPU_SET(0, &mask); + //sched_setaffinity(0, sizeof(mask), &mask); +#endif + + if (sliceCount == -1) sliceCount = fSliceCount; + + if (CheckMemorySizes(sliceCount)) return(1); + +#ifdef R__WIN32 + HANDLE* semLock = new HANDLE; + *semLock = CreateSemaphore(NULL, 1, 1, SemLockName); + if (*semLock == NULL) + { + HLTError("Error creating GPUInit Semaphore"); + return(1); + } + WaitForSingleObject(*semLock, INFINITE); +#else + sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1); + if (semLock == SEM_FAILED) + { + HLTError("Error creating GPUInit Semaphore"); + return(1); + } + timespec semtime; + clock_gettime(CLOCK_REALTIME, &semtime); + semtime.tv_sec += 10; + while (sem_timedwait(semLock, &semtime) != 0) + { + HLTError("Global Lock for GPU initialisation was not released for 10 seconds, assuming another thread died"); + HLTWarning("Resetting the global lock"); + sem_post(semLock); + } +#endif + + fThreadId = GetThread(); + + fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY); + +#ifdef HLTCA_GPU_MERGER + fGPUMergerMaxMemory = 2000000 * 5 * sizeof(float); + fGPUMemSize += fGPUMergerMaxMemory; +#endif + + int retVal = InitGPU_Runtime(sliceCount, forceDeviceID); + ReleaseGlobalLock(semLock); + + if (retVal) + { + HLTImportant("GPU Tracker initialization failed"); + return(1); + } + + fSliceCount = sliceCount; + //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory + fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0); + + for (int i = 0;i < fgkNSlices;i++) + { + fSlaveTrackers[i].SetGPUTracker(); + fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i)); + fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i)); + } + + if (StartHelperThreads()) return(1); + + fHelperMemMutex = malloc(sizeof(pthread_mutex_t)); + if (fHelperMemMutex == NULL) + { + HLTError("Memory allocation error"); + ExitGPU_Runtime(); + return(1); + } + + if (pthread_mutex_init((pthread_mutex_t*) fHelperMemMutex, NULL)) + { + HLTError("Error creating pthread mutex"); + ExitGPU_Runtime(); + free(fHelperMemMutex); + return(1); + } + + fSliceGlobalMutexes = malloc(sizeof(pthread_mutex_t) * fgkNSlices); + if (fSliceGlobalMutexes == NULL) + { + HLTError("Memory allocation error"); + ExitGPU_Runtime(); + return(1); + } + for (int i = 0;i < fgkNSlices;i++) + { + if (pthread_mutex_init(&((pthread_mutex_t*) fSliceGlobalMutexes)[i], NULL)) + { + HLTError("Error creating pthread mutex"); + ExitGPU_Runtime(); + return(1); + } + } + + fCudaInitialized = 1; + HLTImportant("GPU Tracker initialization successfull"); + +#if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION) + 
if (fDebugLevel < 2 && 0) + { + //Do one initial run for Benchmark reasons + const int useDebugLevel = fDebugLevel; + fDebugLevel = 0; + AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount]; + + std::ifstream fin; + + AliHLTTPCCAParam tmpParam; + AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl; + + fin.open("events/settings.dump"); + int tmpCount; + fin >> tmpCount; + for (int i = 0;i < sliceCount;i++) + { + fSlaveTrackers[i].SetOutputControl(&tmpOutputControl); + tmpParam.ReadSettings(fin); + InitializeSliceParam(i, tmpParam); + } + fin.close(); + + fin.open("eventspbpbc/event.0.dump", std::ifstream::binary); + for (int i = 0;i < sliceCount;i++) + { + tmpCluster[i].StartReading(i, 0); + tmpCluster[i].ReadEvent(fin); + } + fin.close(); + + AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount]; + memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*)); + + Reconstruct(tmpOutput, tmpCluster, 0, sliceCount); + for (int i = 0;i < sliceCount;i++) + { + free(tmpOutput[i]); + tmpOutput[i] = NULL; + fSlaveTrackers[i].SetOutputControl(NULL); + } + delete[] tmpOutput; + delete[] tmpCluster; + fDebugLevel = useDebugLevel; + } +#endif + + return(retVal); +} + +int AliHLTTPCCAGPUTrackerBase::ExitGPU() +{ + if (StopHelperThreads()) return(1); + pthread_mutex_destroy((pthread_mutex_t*) fHelperMemMutex); + free(fHelperMemMutex); + + for (int i = 0;i < fgkNSlices;i++) pthread_mutex_destroy(&((pthread_mutex_t*) fSliceGlobalMutexes)[i]); + free(fSliceGlobalMutexes); + + return(ExitGPU_Runtime()); +} + +int AliHLTTPCCAGPUTrackerBase::Reconstruct_Base_FinishSlices(AliHLTTPCCASliceOutput** pOutput, int& iSlice, int& firstSlice) +{ + fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTracks = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracks; + fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTrackHits = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTrackHits; + if (fUseGlobalTracking) fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracklets = 1; + + if (fDebugLevel >= 3) HLTInfo("Data ready for slice %d, helper thread %d", iSlice, iSlice % (fNHelperThreads + 1)); + fSliceOutputReady = iSlice; + + if (fUseGlobalTracking) + { + if (iSlice % (fgkNSlices / 2) == 2) + { + int tmpId = iSlice % (fgkNSlices / 2) - 1; + if (iSlice >= fgkNSlices / 2) tmpId += fgkNSlices / 2; + GlobalTracking(tmpId, 0, NULL); + fGlobalTrackingDone[tmpId] = 1; + } + for (int tmpSlice3a = 0;tmpSlice3a < iSlice;tmpSlice3a += fNHelperThreads + 1) + { + int tmpSlice3 = tmpSlice3a + 1; + if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2); + if (tmpSlice3 >= iSlice) break; + + int sliceLeft = (tmpSlice3 + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2); + int sliceRight = (tmpSlice3 + 1) % (fgkNSlices / 2); + if (tmpSlice3 >= fgkNSlices / 2) + { + sliceLeft += fgkNSlices / 2; + sliceRight += fgkNSlices / 2; + } + + if (tmpSlice3 % (fgkNSlices / 2) != 1 && fGlobalTrackingDone[tmpSlice3] == 0 && sliceLeft < iSlice && sliceRight < iSlice) + { + GlobalTracking(tmpSlice3, 0, NULL); + fGlobalTrackingDone[tmpSlice3] = 1; + } + + if (fWriteOutputDone[tmpSlice3] == 0 && fSliceLeftGlobalReady[tmpSlice3] && fSliceRightGlobalReady[tmpSlice3]) + { + WriteOutput(pOutput, firstSlice, tmpSlice3, 0); + fWriteOutputDone[tmpSlice3] = 1; + } + } + } + else + { + if (iSlice % (fNHelperThreads + 1) == 0) + { + WriteOutput(pOutput, firstSlice, iSlice, 0); + } + } + return(0); +} + +int 
AliHLTTPCCAGPUTrackerBase::Reconstruct_Base_Finalize(AliHLTTPCCASliceOutput** pOutput, char*& tmpMemoryGlobalTracking, int& firstSlice) +{ + if (fUseGlobalTracking) + { + for (int tmpSlice3a = 0;tmpSlice3a < fgkNSlices;tmpSlice3a += fNHelperThreads + 1) + { + int tmpSlice3 = (tmpSlice3a + 1); + if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2); + if (fGlobalTrackingDone[tmpSlice3] == 0) GlobalTracking(tmpSlice3, 0, NULL); + } + for (int tmpSlice3a = 0;tmpSlice3a < fgkNSlices;tmpSlice3a += fNHelperThreads + 1) + { + int tmpSlice3 = (tmpSlice3a + 1); + if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2); + if (fWriteOutputDone[tmpSlice3] == 0) + { + while (fSliceLeftGlobalReady[tmpSlice3] == 0 || fSliceRightGlobalReady[tmpSlice3] == 0); + WriteOutput(pOutput, firstSlice, tmpSlice3, 0); + } + } + } + + for (int i = 0;i < fNHelperThreads + fNCPUTrackers;i++) + { + pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]); + } + + if (fUseGlobalTracking) + { + free(tmpMemoryGlobalTracking); + if (fDebugLevel >= 3) + { + for (int iSlice = 0;iSlice < fgkNSlices;iSlice++) + { + HLTDebug("Slice %d - Tracks: Local %d Global %d - Hits: Local %d Global %d\n", iSlice, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTracks, fSlaveTrackers[iSlice].CommonMemory()->fNTracks, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTrackHits, fSlaveTrackers[iSlice].CommonMemory()->fNTrackHits); + } + } + } + + StandalonePerfTime(firstSlice, 10); + + if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished"); + return(0); +} + +int AliHLTTPCCAGPUTrackerBase::Reconstruct_Base_StartGlobal(AliHLTTPCCASliceOutput** pOutput, char*& tmpMemoryGlobalTracking) +{ + if (fUseGlobalTracking) + { + int tmpmemSize = sizeof(AliHLTTPCCATracklet) +#ifdef EXTERN_ROW_HITS + + HLTCA_ROW_COUNT * sizeof(int) +#endif + + 16; + tmpMemoryGlobalTracking = (char*) malloc(tmpmemSize * fgkNSlices); + for (int i = 0;i < fgkNSlices;i++) + { + fSliceLeftGlobalReady[i] = 0; + fSliceRightGlobalReady[i] = 0; + } + memset(fGlobalTrackingDone, 0, fgkNSlices); + memset(fWriteOutputDone, 0, fgkNSlices); + + for (int iSlice = 0;iSlice < fgkNSlices;iSlice++) + { + fSlaveTrackers[iSlice].SetGPUTrackerTrackletsMemory(tmpMemoryGlobalTracking + (tmpmemSize * iSlice), 1, fConstructorBlockCount); + } + } + for (int i = 0;i < fNHelperThreads;i++) + { + fHelperParams[i].fPhase = 1; + fHelperParams[i].pOutput = pOutput; + pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]); + } + return(0); +} + +int AliHLTTPCCAGPUTrackerBase::Reconstruct_Base_SliceInit(AliHLTTPCCAClusterData* pClusterData, int& iSlice, int& firstSlice) +{ + StandalonePerfTime(firstSlice + iSlice, 0); + + //Initialize GPU Slave Tracker + if (fDebugLevel >= 3) HLTInfo("Creating Slice Data (Slice %d)", iSlice); + if (iSlice % (fNHelperThreads + 1) == 0) + { + ReadEvent(pClusterData, firstSlice, iSlice, 0); + } + else + { + if (fDebugLevel >= 3) HLTInfo("Waiting for helper thread %d", iSlice % (fNHelperThreads + 1) - 1); + while(fHelperParams[iSlice % (fNHelperThreads + 1) - 1].fDone < iSlice); + } + + if (fDebugLevel >= 4) + { +#ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT + *fOutFile << std::endl << std::endl << "Reconstruction: " << iSlice << "/" << sliceCountLocal << " Total Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << " / " << fgkNSlices << std::endl; +#endif + if (fDebugMask & 1) fSlaveTrackers[firstSlice + iSlice].DumpSliceData(*fOutFile); + } + + if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > 
HLTCA_GPU_SLICE_DATA_MEMORY RANDOM_ERROR) + { + HLTError("Insufficiant Slice Data Memory"); + ResetHelperThreads(1); + return(1); + } + + if (fDebugLevel >= 3) + { + HLTInfo("GPU Slice Data Memory Used: %d/%d", (int) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize(), HLTCA_GPU_SLICE_DATA_MEMORY); + } + return(0); +} + +int AliHLTTPCCAGPUTrackerBase::Reconstruct_Base_Init(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int& firstSlice, int& sliceCountLocal) +{ + if (sliceCountLocal == -1) sliceCountLocal = fSliceCount; + + if (!fCudaInitialized) + { + HLTError("GPUTracker not initialized"); + return(1); + } + if (sliceCountLocal > fSliceCount) + { + HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal); + return(1); + } + if (fThreadId != GetThread()) + { + HLTWarning("CUDA thread changed, migrating context, Previous Thread: %d, New Thread: %d", fThreadId, GetThread()); + fThreadId = GetThread(); + } + + if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal); + + if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM) + { + HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData)); + return(1); + } + + ActivateThreadContext(); + if (fPPMode) + { + int retVal = ReconstructPP(pOutput, pClusterData, firstSlice, sliceCountLocal); + ReleaseThreadContext(); + return(retVal); + } + + for (int i = fNHelperThreads;i < fNCPUTrackers + fNHelperThreads;i++) + { + fHelperParams[i].CPUTracker = 1; + fHelperParams[i].pClusterData = pClusterData; + fHelperParams[i].pOutput = pOutput; + fHelperParams[i].fSliceCount = sliceCountLocal; + fHelperParams[i].fFirstSlice = firstSlice; + pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]); + } + sliceCountLocal -= fNCPUTrackers * fNSlicesPerCPUTracker; + if (sliceCountLocal < 0) sliceCountLocal = 0; + + fUseGlobalTracking = fGlobalTracking && sliceCountLocal == fgkNSlices; + + memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal); + + if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants"); + +#ifdef HLTCA_GPU_TIME_PROFILE + AliHLTTPCCATracker::StandaloneQueryFreq(&fProfTimeC); + AliHLTTPCCATracker::StandaloneQueryTime(&fProfTimeD); +#endif + + for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) + { + //Make this a GPU Tracker + fGpuTracker[iSlice].SetGPUTracker(); + fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice)); + fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice)); + fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false); + fGpuTracker[iSlice].GPUParametersConst()->fGPUMem = (char*) fGPUMemory; + + //Set Pointers to GPU Memory + char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice); + + if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory"); + tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters()); + tmpMem = alignPointer(tmpMem, 1024 * 1024); + + if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory"); + 
tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount); + tmpMem = alignPointer(tmpMem, 1024 * 1024); + + if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory"); + tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters()); + tmpMem = alignPointer(tmpMem, 1024 * 1024); + + if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY RANDOM_ERROR) + { + HLTError("Insufficiant Track Memory"); + ResetHelperThreads(0); + return(1); + } + + if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY RANDOM_ERROR) + { + HLTError("Insufficiant Global Memory"); + ResetHelperThreads(0); + return(1); + } + + if (fDebugLevel >= 3) + { + HLTInfo("GPU Global Memory Used: %d/%d, Page Locked Tracks Memory used: %d / %d", (int) (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice)), HLTCA_GPU_GLOBAL_MEMORY, (int) fGpuTracker[iSlice].TrackMemorySize(), HLTCA_GPU_TRACKS_MEMORY); + } + + //Initialize Startup Constants + *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0; + *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0; + *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0; + fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = sliceCountLocal > fConstructorBlockCount ? (iSlice < fConstructorBlockCount) : fConstructorBlockCount * (iSlice + 1) / sliceCountLocal - fConstructorBlockCount * (iSlice) / sliceCountLocal; + if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount); + fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice; + fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal; + fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0; + fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fNextTracklet = (fConstructorBlockCount / sliceCountLocal + (fConstructorBlockCount % sliceCountLocal > iSlice)) * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR; + fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory()); + } + + for (int i = 0;i < fNHelperThreads;i++) + { + fHelperParams[i].CPUTracker = 0; + fHelperParams[i].fDone = 0; + fHelperParams[i].fPhase = 0; + fHelperParams[i].pClusterData = pClusterData; + fHelperParams[i].fSliceCount = sliceCountLocal; + fHelperParams[i].fFirstSlice = firstSlice; + pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]); + } + + return(0); +} diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerBase.h b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerBase.h new file mode 100644 index 00000000000..5c95b0c42d6 --- /dev/null +++ b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerBase.h @@ -0,0 +1,207 @@ +//-*- Mode: C++ -*- +// $Id$ + +// ************************************************************************ +// This file is property of and copyright by the ALICE HLT Project * +// ALICE Experiment at CERN, All rights reserved. 
* +// See cxx source for full Copyright notice * +// * +//************************************************************************* + +// @file AliHLTTPCCAGPUTrackerBase.h +// @author David Rohr, Sergey Gorbunov +// @date +// @brief TPC CA Tracker for the NVIDIA GPU +// @note + +#ifndef ALIHLTTPCCAGPUTRACKERBASE_H +#define ALIHLTTPCCAGPUTRACKERBASE_H + +#define HLTCA_GPU_DEFAULT_MAX_SLICE_COUNT 36 + +#include "AliHLTTPCCAGPUTracker.h" +#include "AliHLTTPCCADef.h" +#include "AliHLTTPCCATracker.h" +#include "AliHLTLogging.h" +#include "AliHLTTPCCASliceOutput.h" + +#ifdef __CINT__ +typedef int cudaError_t +#elif defined(R__WIN32) +#include "../cmodules/pthread_mutex_win32_wrapper.h" +#else +#include +#include +#endif + +#define RANDOM_ERROR +//#define RANDOM_ERROR || rand() % 500 == 1 + +MEM_CLASS_PRE() class AliHLTTPCCARow; + +class AliHLTTPCCAGPUTrackerBase : public AliHLTTPCCAGPUTracker, public AliHLTLogging +{ + friend void* helperWrapper(void*); +public: + AliHLTTPCCAGPUTrackerBase(); + virtual ~AliHLTTPCCAGPUTrackerBase(); + + virtual int InitGPU(int sliceCount = -1, int forceDeviceID = -1); + virtual int InitGPU_Runtime(int sliceCount = -1, int forceDeviceID = -1) = 0; + virtual int IsInitialized(); + virtual int Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1) = 0; + int SelfHealReconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1); + virtual int ExitGPU(); + virtual int ExitGPU_Runtime() = 0; + + virtual void SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile = NULL); + virtual int SetGPUTrackerOption(char* OptionName, int OptionValue); + + virtual unsigned long long int* PerfTimer(int iSlice, unsigned int i); + + virtual int InitializeSliceParam(int iSlice, AliHLTTPCCAParam ¶m); + virtual void SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val); + + virtual const AliHLTTPCCASliceOutput::outputControlStruct* OutputControl() const; + virtual int GetSliceCount() const; + + virtual int RefitMergedTracks(AliHLTTPCGMMerger* Merger) = 0; + virtual char* MergerBaseMemory(); + +protected: + virtual void ActivateThreadContext() = 0; + virtual void ReleaseThreadContext() = 0; + virtual void SynchronizeGPU() = 0; + + struct helperParam + { + void* fThreadId; + AliHLTTPCCAGPUTrackerBase* fCls; + int fNum; + int fSliceCount; + AliHLTTPCCAClusterData* pClusterData; + AliHLTTPCCASliceOutput** pOutput; + int fFirstSlice; + void* fMutex; + bool fTerminate; + int fPhase; + int CPUTracker; + volatile int fDone; + volatile bool fReset; + }; + + static void* RowMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + iSlice * sizeof(AliHLTTPCCARow) * (HLTCA_ROW_COUNT + 1) ); } + static void* CommonMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + iSlice * AliHLTTPCCATracker::CommonMemorySize() ); } + static void* SliceDataMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + iSlice * HLTCA_GPU_SLICE_DATA_MEMORY ); } + void* GlobalMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY) + iSlice * HLTCA_GPU_GLOBAL_MEMORY ); } + void* TracksMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount 
* (HLTCA_GPU_SLICE_DATA_MEMORY) + iSlice * HLTCA_GPU_TRACKS_MEMORY ); } + void* TrackerMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + iSlice * sizeof(AliHLTTPCCATracker) ); } + + int Reconstruct_Base_Init(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int& firstSlice, int& sliceCountLocal); + int Reconstruct_Base_SliceInit(AliHLTTPCCAClusterData* pClusterData, int& iSlice, int& firstSlice); + int Reconstruct_Base_StartGlobal(AliHLTTPCCASliceOutput** pOutput, char*& tmpMemoryGlobalTracking); + int Reconstruct_Base_FinishSlices(AliHLTTPCCASliceOutput** pOutput, int& iSlice, int& firstSlice); + int Reconstruct_Base_Finalize(AliHLTTPCCASliceOutput** pOutput, char*& tmpMemoryGlobalTracking, int& firstSlice); + virtual int ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1) = 0; + + void ReadEvent(AliHLTTPCCAClusterData* pClusterData, int firstSlice, int iSlice, int threadId); + void WriteOutput(AliHLTTPCCASliceOutput** pOutput, int firstSlice, int iSlice, int threadId); + int GlobalTracking(int iSlice, int threadId, helperParam* hParam); + + int StartHelperThreads(); + int StopHelperThreads(); + void ResetHelperThreads(int helpers); + void ResetThisHelperThread(AliHLTTPCCAGPUTrackerBase::helperParam* par); + + int GetThread(); + void ReleaseGlobalLock(void* sem); + int CheckMemorySizes(int sliceCount); + + virtual int GPUSync(char* state = "UNKNOWN", int stream = -1, int slice = 0) = 0; + template T* alignPointer(T* ptr, int alignment); + void StandalonePerfTime(int iSlice, int i); +#define GPUFailedMsg(x) GPUFailedMsgA(x, __FILE__, __LINE__) + + static void* helperWrapper(void*); + + AliHLTTPCCATracker *fGpuTracker; //Tracker Objects that will be used on the GPU + void* fGPUMemory; //Pointer to GPU Memory Base Adress + void* fHostLockedMemory; //Pointer to Base Adress of Page Locked Host Memory for DMA Transfer + + void* fGPUMergerMemory; + void* fGPUMergerHostMemory; + int fGPUMergerMaxMemory; + + int fDebugLevel; //Debug Level for GPU Tracker + unsigned int fDebugMask; //Mask which Debug Data is written to file + std::ostream* fOutFile; //Debug Output Stream Pointer + unsigned long long int fGPUMemSize; //Memory Size to allocate on GPU + + int fSliceCount; //Maximum Number of Slices this GPU tracker can process in parallel + int fCudaDevice; //CUDA device used by GPU tracker + + static const int fgkNSlices = 36; //Number of Slices in Alice + AliHLTTPCCATracker fSlaveTrackers[fgkNSlices]; //CPU Slave Trackers for Initialization and Output + + AliHLTTPCCASliceOutput::outputControlStruct* fOutputControl; //Output Control Structure + + int fThreadId; //Thread ID that is valid for the local CUDA context + int fCudaInitialized; //Flag if CUDA is initialized + + int fPPMode; //Flag if GPU tracker runs in PP Mode + int fSelfheal; //Reinitialize GPU on failure + + int fConstructorBlockCount; //GPU blocks used in Tracklet Constructor + int selectorBlockCount; //GPU blocks used in Tracklet Selector + +#ifdef HLTCA_GPU_TIME_PROFILE + unsigned long long int fProfTimeC, fProfTimeD; //Timing +#endif + + int fNHelperThreads; //Number of helper threads for post/preprocessing + helperParam* fHelperParams; //Control Struct for helper threads + void* fHelperMemMutex; + +#ifdef __ROOT__ +#define volatile +#endif + volatile int fSliceOutputReady; + volatile char 
fSliceLeftGlobalReady[fgkNSlices]; + volatile char fSliceRightGlobalReady[fgkNSlices]; +#ifdef __ROOT__ +#undef volatile +#endif + void* fSliceGlobalMutexes; + char fGlobalTrackingDone[fgkNSlices]; + char fWriteOutputDone[fgkNSlices]; + + int fNCPUTrackers; //Number of CPU trackers to use + int fNSlicesPerCPUTracker; //Number of slices processed by each CPU tracker + + int fGlobalTracking; //Use Global Tracking + int fUseGlobalTracking; + + int fNSlaveThreads; //Number of slave threads currently active + + // disable copy + AliHLTTPCCAGPUTrackerBase( const AliHLTTPCCAGPUTrackerBase& ); + AliHLTTPCCAGPUTrackerBase &operator=( const AliHLTTPCCAGPUTrackerBase& ); + + ClassDef( AliHLTTPCCAGPUTrackerBase, 0 ) +}; + +template inline T* AliHLTTPCCAGPUTrackerBase::alignPointer(T* ptr, int alignment) +{ + //Macro to align Pointers. + //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker + //(As long as every single data structure is <= 1 MB) + + size_t adr = (size_t) ptr; + if (adr % alignment) + { + adr += alignment - (adr % alignment); + } + return((T*) adr); +} + +#endif diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerCommon.h b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerCommon.h new file mode 100644 index 00000000000..346783170ab --- /dev/null +++ b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerCommon.h @@ -0,0 +1,28 @@ +//Disable assertions since they produce errors in GPU Code +#ifdef assert +#undef assert +#endif +#define assert(param) + +#ifdef R__WIN32 +#else +#include +#include +#include +#endif +#include "AliHLTTPCCADef.h" +#include "AliHLTTPCCAGPUConfig.h" + +#if defined(HLTCA_STANDALONE) & !defined(_WIN32) +#include +#endif + +#include +#include + +#include "MemoryAssignmentHelpers.h" + +#ifndef HLTCA_STANDALONE +#include "AliHLTDefinitions.h" +#include "AliHLTSystem.h" +#endif diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu index 7b2874c9b6f..0ce1be6db68 100755 --- a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu +++ b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu @@ -17,37 +17,19 @@ // * //*************************************************************************** -#define HLTCA_GPU_DEFAULT_MAX_SLICE_COUNT 36 #define FERMI #include "AliHLTTPCCAGPUTrackerNVCC.h" +#include "AliHLTTPCCAGPUTrackerCommon.h" +#define get_global_id(dim) (blockIdx.x * blockDim.x + threadIdx.x) +#define get_global_size(dim) (blockDim.x * gridDim.x) +#define get_num_groups(dim) (gridDim.x) +#define get_local_id(dim) (threadIdx.x) +#define get_local_size(dim) (blockDim.x) +#define get_group_id(dim) (blockIdx.x) -#ifdef HLTCA_GPUCODE #include #include #include -#endif - -#ifdef R__WIN32 -#else -#include -#include -#include -#endif -#include "AliHLTTPCCADef.h" -#include "AliHLTTPCCAGPUConfig.h" - -#if defined(HLTCA_STANDALONE) & !defined(_WIN32) -#include -#endif - -#include -#include - -//Disable assertions since they produce errors in GPU Code -#ifdef assert -#undef assert -#endif -#define assert(param) __constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )]; #ifdef HLTCA_GPU_TEXTURE_FETCH @@ -80,213 +62,9 @@ texture gAliTexRefs; #include "AliHLTTPCGMTrackParam.cxx" #endif -#include "MemoryAssignmentHelpers.h" - -#ifndef HLTCA_STANDALONE -#include "AliHLTDefinitions.h" -#include "AliHLTSystem.h" -#endif - -#define RANDOM_ERROR -//#define RANDOM_ERROR || rand() % 500 == 1 - ClassImp( 
AliHLTTPCCAGPUTrackerNVCC ) -int AliHLTTPCCAGPUTrackerNVCC::GlobalTracking(int iSlice, int threadId, AliHLTTPCCAGPUTrackerNVCC::helperParam* hParam) -{ - if (fDebugLevel >= 3) printf("GPU Tracker running Global Tracking for slice %d on thread %d\n", iSlice, threadId); - - int sliceLeft = (iSlice + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2); - int sliceRight = (iSlice + 1) % (fgkNSlices / 2); - if (iSlice >= fgkNSlices / 2) - { - sliceLeft += fgkNSlices / 2; - sliceRight += fgkNSlices / 2; - } - while (fSliceOutputReady < iSlice || fSliceOutputReady < sliceLeft || fSliceOutputReady < sliceRight) - { - if (hParam != NULL && hParam->fReset) return(1); - } - - pthread_mutex_lock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceLeft]); - pthread_mutex_lock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceRight]); - fSlaveTrackers[iSlice].PerformGlobalTracking(fSlaveTrackers[sliceLeft], fSlaveTrackers[sliceRight], HLTCA_GPU_MAX_TRACKS); - pthread_mutex_unlock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceLeft]); - pthread_mutex_unlock(&((pthread_mutex_t*) fSliceGlobalMutexes)[sliceRight]); - - fSliceLeftGlobalReady[sliceLeft] = 1; - fSliceRightGlobalReady[sliceRight] = 1; - if (fDebugLevel >= 3) printf("GPU Tracker finished Global Tracking for slice %d on thread %d\n", iSlice, threadId); - return(0); -} - -void* AliHLTTPCCAGPUTrackerNVCC::helperWrapper(void* arg) -{ - AliHLTTPCCAGPUTrackerNVCC::helperParam* par = (AliHLTTPCCAGPUTrackerNVCC::helperParam*) arg; - AliHLTTPCCAGPUTrackerNVCC* cls = par->fCls; - - AliHLTTPCCATracker* tmpTracker = new AliHLTTPCCATracker; - -#ifdef HLTCA_STANDALONE - if (cls->fDebugLevel >= 2) HLTInfo("\tHelper thread %d starting", par->fNum); -#endif - -#if defined(HLTCA_STANDALONE) & !defined(_WIN32) - cpu_set_t mask; - CPU_ZERO(&mask); - CPU_SET(par->fNum * 2 + 2, &mask); - //sched_setaffinity(0, sizeof(mask), &mask); -#endif - - while(pthread_mutex_lock(&((pthread_mutex_t*) par->fMutex)[0]) == 0 && par->fTerminate == false) - { - if (par->CPUTracker) - { - for (int i = 0;i < cls->fNSlicesPerCPUTracker;i++) - { - int myISlice = cls->fSliceCount - cls->fNCPUTrackers * cls->fNSlicesPerCPUTracker + (par->fNum - cls->fNHelperThreads) * cls->fNSlicesPerCPUTracker + i; -#ifdef HLTCA_STANDALONE - if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Doing full CPU tracking, Slice %d", par->fNum, myISlice); -#endif - if (myISlice >= 0) - { - tmpTracker->Initialize(cls->fSlaveTrackers[par->fFirstSlice + myISlice].Param()); - tmpTracker->ReadEvent(&par->pClusterData[myISlice]); - tmpTracker->DoTracking(); - tmpTracker->SetOutput(&par->pOutput[myISlice]); - pthread_mutex_lock((pthread_mutex_t*) cls->fHelperMemMutex); - tmpTracker->WriteOutputPrepare(); - pthread_mutex_unlock((pthread_mutex_t*) cls->fHelperMemMutex); - tmpTracker->WriteOutput(); - - /*cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUSliceDataMemory((char*) new uint4[HLTCA_GPU_SLICE_DATA_MEMORY/sizeof(uint4)], (char*) new uint4[HLTCA_GPU_ROWS_MEMORY/sizeof(uint4)]); - cls->fSlaveTrackers[par->fFirstSlice + myISlice].ReadEvent(&par->pClusterData[myISlice]); - cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersTracklets(HLTCA_GPU_MAX_TRACKLETS); - cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersHits(par->pClusterData[myISlice].NumberOfClusters()); - cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetPointersTracks(HLTCA_GPU_MAX_TRACKS, par->pClusterData[myISlice].NumberOfClusters()); - cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerTrackletsMemory(reinterpret_cast ( new uint4 
[ cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS, cls->fConstructorBlockCount); - cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerHitsMemory(reinterpret_cast ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].HitMemorySize()/sizeof( uint4 ) + 100]), par->pClusterData[myISlice].NumberOfClusters()); - cls->fSlaveTrackers[par->fFirstSlice + myISlice].SetGPUTrackerTracksMemory(reinterpret_cast ( new uint4 [ cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackMemorySize()/sizeof( uint4 ) + 100]), HLTCA_GPU_MAX_TRACKS, par->pClusterData[myISlice].NumberOfClusters()); - cls->fSlaveTrackers[par->fFirstSlice + myISlice].DoTracking(); - cls->WriteOutput(par->pOutput, par->fFirstSlice, myISlice, par->fNum + 1); - delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].HitMemory(); - delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackletMemory(); - delete[] cls->fSlaveTrackers[par->fFirstSlice + myISlice].TrackMemory();*/ - } -#ifdef HLTCA_STANDALONE - if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Finished, Slice %d", par->fNum, myISlice); -#endif - } - } - else - { - int mustRunSlice19 = 0; - for (int i = par->fNum + 1;i < par->fSliceCount;i += cls->fNHelperThreads + 1) - { - //if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Running, Slice %d+%d, Phase %d", par->fNum, par->fFirstSlice, i, par->fPhase); - if (par->fPhase) - { - if (cls->fUseGlobalTracking) - { - int realSlice = i + 1; - if (realSlice % (fgkNSlices / 2) < 1) realSlice -= fgkNSlices / 2; - - if (realSlice % (fgkNSlices / 2) != 1) - { - cls->GlobalTracking(realSlice, par->fNum + 1, par); - } - - if (realSlice == 19) - { - mustRunSlice19 = 1; - } - else - { - while (cls->fSliceLeftGlobalReady[realSlice] == 0 || cls->fSliceRightGlobalReady[realSlice] == 0) - { - if (par->fReset) goto ResetHelperThread; - } - cls->WriteOutput(par->pOutput, par->fFirstSlice, realSlice, par->fNum + 1); - } - } - else - { - while (cls->fSliceOutputReady < i) - { - if (par->fReset) goto ResetHelperThread; - } - cls->WriteOutput(par->pOutput, par->fFirstSlice, i, par->fNum + 1); - } - } - else - { - cls->ReadEvent(par->pClusterData, par->fFirstSlice, i, par->fNum + 1); - par->fDone = i + 1; - } - //if (cls->fDebugLevel >= 3) HLTInfo("\tHelper Thread %d Finished, Slice %d+%d, Phase %d", par->fNum, par->fFirstSlice, i, par->fPhase); - } - if (mustRunSlice19) - { - while (cls->fSliceLeftGlobalReady[19] == 0 || cls->fSliceRightGlobalReady[19] == 0) - { - if (par->fReset) goto ResetHelperThread; - } - cls->WriteOutput(par->pOutput, par->fFirstSlice, 19, par->fNum + 1); - } - } -ResetHelperThread: - cls->ResetThisHelperThread(par); - } -#ifdef HLTCA_STANDALONE - if (cls->fDebugLevel >= 2) HLTInfo("\tHelper thread %d terminating", par->fNum); -#endif - delete tmpTracker; - pthread_mutex_unlock(&((pthread_mutex_t*) par->fMutex)[1]); - pthread_exit(NULL); - return(NULL); -} - -void AliHLTTPCCAGPUTrackerNVCC::ResetThisHelperThread(AliHLTTPCCAGPUTrackerNVCC::helperParam* par) -{ - if (par->fReset) HLTImportant("GPU Helper Thread %d reseting", par->fNum); - par->fReset = false; - pthread_mutex_unlock(&((pthread_mutex_t*) par->fMutex)[1]); -} - -#define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem" - -AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() : -fGpuTracker(NULL), -fGPUMemory(NULL), -fHostLockedMemory(NULL), -fGPUMergerMemory(NULL), -fGPUMergerHostMemory(NULL), -fGPUMergerMaxMemory(0), -fDebugLevel(0), -fDebugMask(0xFFFFFFFF), 
-fOutFile(NULL), -fGPUMemSize(0), -fpCudaStreams(NULL), -fSliceCount(HLTCA_GPU_DEFAULT_MAX_SLICE_COUNT), -fCudaDevice(0), -fOutputControl(NULL), -fThreadId(0), -fCudaInitialized(0), -fPPMode(0), -fSelfheal(0), -fConstructorBlockCount(30), -selectorBlockCount(30), -fCudaContext(NULL), -fNHelperThreads(HLTCA_GPU_DEFAULT_HELPER_THREADS), -fHelperParams(NULL), -fHelperMemMutex(NULL), -fSliceOutputReady(0), -fSliceGlobalMutexes(NULL), -fNCPUTrackers(0), -fNSlicesPerCPUTracker(0), -fGlobalTracking(0), -fUseGlobalTracking(0), -fNSlaveThreads(0) +AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() : fpCudaStreams(NULL) { fCudaContext = (void*) new CUcontext; }; @@ -296,113 +74,22 @@ AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC() delete (CUcontext*) fCudaContext; }; -void AliHLTTPCCAGPUTrackerNVCC::ReleaseGlobalLock(void* sem) -{ - //Release the global named semaphore that locks GPU Initialization -#ifdef R__WIN32 - HANDLE* h = (HANDLE*) sem; - ReleaseSemaphore(*h, 1, NULL); - CloseHandle(*h); - delete h; -#else - sem_t* pSem = (sem_t*) sem; - sem_post(pSem); - sem_unlink(SemLockName); -#endif -} - -int AliHLTTPCCAGPUTrackerNVCC::CheckMemorySizes(int sliceCount) -{ - //Check constants for correct memory sizes - if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY) - { - HLTError("Insufficiant Tracker Object Memory for %d slices", sliceCount); - return(1); - } - - if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY) - { - HLTError("Insufficiant Common Memory"); - return(1); - } - - if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY) - { - HLTError("Insufficiant Row Memory"); - return(1); - } - - if (fDebugLevel >= 3) - { - HLTInfo("Memory usage: Tracker Object %d / %d, Common Memory %d / %d, Row Memory %d / %d", (int) sizeof(AliHLTTPCCATracker) * sliceCount, HLTCA_GPU_TRACKER_OBJECT_MEMORY, (int) (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize()), HLTCA_GPU_COMMON_MEMORY, (int) (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow)), HLTCA_GPU_ROWS_MEMORY); - } - return(0); -} - -int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID) +int AliHLTTPCCAGPUTrackerNVCC::InitGPU_Runtime(int sliceCount, int forceDeviceID) { //Find best CUDA device, initialize and allocate memory -#if defined(HLTCA_STANDALONE) & !defined(_WIN32) - cpu_set_t mask; - CPU_ZERO(&mask); - CPU_SET(0, &mask); - //sched_setaffinity(0, sizeof(mask), &mask); -#endif - - if (sliceCount == -1) sliceCount = fSliceCount; - - if (CheckMemorySizes(sliceCount)) return(1); - -#ifdef R__WIN32 - HANDLE* semLock = new HANDLE; - *semLock = CreateSemaphore(NULL, 1, 1, SemLockName); - if (*semLock == NULL) - { - HLTError("Error creating GPUInit Semaphore"); - return(1); - } - WaitForSingleObject(*semLock, INFINITE); -#else - sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1); - if (semLock == SEM_FAILED) - { - HLTError("Error creating GPUInit Semaphore"); - return(1); - } - timespec semtime; - clock_gettime(CLOCK_REALTIME, &semtime); - semtime.tv_sec += 10; - while (sem_timedwait(semLock, &semtime) != 0) - { - HLTError("Global Lock for GPU initialisation was not released for 10 seconds, assuming another thread died"); - HLTWarning("Resetting the global lock"); - sem_post(semLock); - } -#endif - - fThreadId = GetThread(); - cudaDeviceProp fCudaDeviceProp; - fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY); - -#ifdef 
HLTCA_GPU_MERGER - fGPUMergerMaxMemory = 2000000 * 5 * sizeof(float); - fGPUMemSize += fGPUMergerMaxMemory; -#endif - #ifndef CUDA_DEVICE_EMULATION int count, bestDevice = -1; long long int bestDeviceSpeed = 0, deviceSpeed; - if (CudaFailedMsg(cudaGetDeviceCount(&count))) + if (GPUFailedMsg(cudaGetDeviceCount(&count))) { HLTError("Error getting CUDA Device Count"); - ReleaseGlobalLock(semLock); return(1); } if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:"); -#ifdef FERMI +#if defined(FERMI) || defined(KEPLER) const int reqVerMaj = 2; const int reqVerMin = 0; #else @@ -425,7 +112,7 @@ int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID) if(cuMemGetInfo(&free, &total)) std::cout << "Error\n"; cuCtxDestroy(tmpContext); if (fDebugLevel >= 4) printf("Obtained current memory usage for device %d\n", i); - if (CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i))) continue; + if (GPUFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i))) continue; if (fDebugLevel >= 4) printf("Obtained device properties for device %d\n", i); int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < reqVerMaj || (fCudaDeviceProp.major == reqVerMaj && fCudaDeviceProp.minor < reqVerMin)) && free >= fGPUMemSize + 100 * 1024 + 1024; #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER @@ -444,7 +131,6 @@ int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID) { HLTWarning("No %sCUDA Device available, aborting CUDA Initialisation", count ? "appropriate " : ""); HLTInfo("Requiring Revision %d.%d, Mem: %lld, Multiprocessors: %d", reqVerMaj, reqVerMin, fGPUMemSize + 100 * 1024 * 1024, sliceCount); - ReleaseGlobalLock(semLock); return(1); } @@ -483,32 +169,28 @@ int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID) if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) { HLTError( "Unsupported CUDA Device" ); - ReleaseGlobalLock(semLock); return(1); } if (cuCtxCreate((CUcontext*) fCudaContext, CU_CTX_SCHED_AUTO, fCudaDevice) != CUDA_SUCCESS) { HLTError("Could not set CUDA Device!"); - ReleaseGlobalLock(semLock); return(1); } - if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize))) + if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || GPUFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize))) { HLTError("CUDA Memory Allocation Error"); cudaThreadExit(); - ReleaseGlobalLock(semLock); return(1); } fGPUMergerMemory = ((char*) fGPUMemory) + fGPUMemSize - fGPUMergerMaxMemory; - ReleaseGlobalLock(semLock); if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize); int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY; #ifdef HLTCA_GPU_MERGER hostMemSize += fGPUMergerMaxMemory; #endif - if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize))) + if (GPUFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize))) { cudaFree(fGPUMemory); cudaThreadExit(); @@ -520,25 +202,14 @@ int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID) if (fDebugLevel >= 1) { - CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize)); - } - - fSliceCount = sliceCount; - //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory - fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0); - - for (int i = 0;i < fgkNSlices;i++) - { - fSlaveTrackers[i].SetGPUTracker(); - 
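The device-selection loop above rejects CUDA devices below the required compute capability or without enough free global memory (queried through a temporary driver context and cuMemGetInfo) and then ranks the remaining candidates by a speed estimate. The following is a stripped-down, self-contained sketch of such a selection using the runtime API instead of a temporary driver context; the helper name and the first-fit policy are illustrative simplifications, only the capability requirement and the memory safety margin mirror the code above.

#include <cuda_runtime.h>

// Illustrative helper (not part of the patch): pick the first CUDA device that
// meets a minimum compute capability and has enough free global memory.
static int PickCudaDevice(size_t requiredMem, int reqVerMaj = 2, int reqVerMin = 0)
{
	int count = 0;
	if (cudaGetDeviceCount(&count) != cudaSuccess) return(-1);
	for (int i = 0; i < count; i++)
	{
		cudaDeviceProp prop;
		if (cudaGetDeviceProperties(&prop, i) != cudaSuccess) continue;
		if (prop.major < reqVerMaj || (prop.major == reqVerMaj && prop.minor < reqVerMin)) continue;
		size_t free = 0, total = 0;
		if (cudaSetDevice(i) != cudaSuccess || cudaMemGetInfo(&free, &total) != cudaSuccess) continue;
		if (free < requiredMem + 100 * 1024 * 1024) continue; //keep a safety margin as above
		return(i); //first acceptable device; the real code ranks candidates by a speed estimate instead
	}
	return(-1);
}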
fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i)); - fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i)); + GPUFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize)); } fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t)); cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams; for (int i = 0;i < CAMath::Max(3, fSliceCount);i++) { - if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i]))) + if (GPUFailedMsg(cudaStreamCreate(&cudaStreams[i]))) { cudaFree(fGPUMemory); cudaFreeHost(fHostLockedMemory); @@ -548,175 +219,13 @@ int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID) } } - if (StartHelperThreads()) return(1); - - fHelperMemMutex = malloc(sizeof(pthread_mutex_t)); - if (fHelperMemMutex == NULL) - { - HLTError("Memory allocation error"); - cudaFree(fGPUMemory); - cudaFreeHost(fHostLockedMemory); - cudaThreadExit(); - return(1); - } - - if (pthread_mutex_init((pthread_mutex_t*) fHelperMemMutex, NULL)) - { - HLTError("Error creating pthread mutex"); - cudaFree(fGPUMemory); - cudaFreeHost(fHostLockedMemory); - cudaThreadExit(); - return(1); - } - - fSliceGlobalMutexes = malloc(sizeof(pthread_mutex_t) * fgkNSlices); - if (fSliceGlobalMutexes == NULL) - { - HLTError("Memory allocation error"); - cudaFree(fGPUMemory); - cudaFreeHost(fHostLockedMemory); - cudaThreadExit(); - return(1); - } - for (int i = 0;i < fgkNSlices;i++) - { - if (pthread_mutex_init(&((pthread_mutex_t*) fSliceGlobalMutexes)[i], NULL)) - { - HLTError("Error creating pthread mutex"); - cudaFree(fGPUMemory); - cudaFreeHost(fHostLockedMemory); - cudaThreadExit(); - return(1); - } - } - cuCtxPopCurrent((CUcontext*) fCudaContext); - fCudaInitialized = 1; HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %d, Max slices: %d)", fCudaDevice, fCudaDeviceProp.name, fThreadId, fSliceCount); -#if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION) - if (fDebugLevel < 2 && 0) - { - //Do one initial run for Benchmark reasons - const int useDebugLevel = fDebugLevel; - fDebugLevel = 0; - AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount]; - - std::ifstream fin; - - AliHLTTPCCAParam tmpParam; - AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl; - - fin.open("events/settings.dump"); - int tmpCount; - fin >> tmpCount; - for (int i = 0;i < sliceCount;i++) - { - fSlaveTrackers[i].SetOutputControl(&tmpOutputControl); - tmpParam.ReadSettings(fin); - InitializeSliceParam(i, tmpParam); - } - fin.close(); - - fin.open("eventspbpbc/event.0.dump", std::ifstream::binary); - for (int i = 0;i < sliceCount;i++) - { - tmpCluster[i].StartReading(i, 0); - tmpCluster[i].ReadEvent(fin); - } - fin.close(); - - AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount]; - memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*)); - - Reconstruct(tmpOutput, tmpCluster, 0, sliceCount); - for (int i = 0;i < sliceCount;i++) - { - free(tmpOutput[i]); - tmpOutput[i] = NULL; - fSlaveTrackers[i].SetOutputControl(NULL); - } - delete[] tmpOutput; - delete[] tmpCluster; - fDebugLevel = useDebugLevel; - } -#endif - return(0); } -int AliHLTTPCCAGPUTrackerNVCC::StartHelperThreads() -{ - int nThreads = fNHelperThreads + fNCPUTrackers; - if (nThreads) - { - fHelperParams = new helperParam[nThreads]; - if (fHelperParams == NULL) - { - HLTError("Memory allocation error"); - cudaFree(fGPUMemory); - cudaFreeHost(fHostLockedMemory); - 
cudaThreadExit(); - return(1); - } - for (int i = 0;i < nThreads;i++) - { - fHelperParams[i].fCls = this; - fHelperParams[i].fTerminate = false; - fHelperParams[i].fReset = false; - fHelperParams[i].fNum = i; - fHelperParams[i].fMutex = malloc(2 * sizeof(pthread_mutex_t)); - if (fHelperParams[i].fMutex == NULL) - { - HLTError("Memory allocation error"); - cudaFree(fGPUMemory); - cudaFreeHost(fHostLockedMemory); - cudaThreadExit(); - return(1); - } - for (int j = 0;j < 2;j++) - { - if (pthread_mutex_init(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j], NULL)) - { - HLTError("Error creating pthread mutex"); - cudaFree(fGPUMemory); - cudaFreeHost(fHostLockedMemory); - cudaThreadExit(); - return(1); - } - - pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]); - } - fHelperParams[i].fThreadId = (void*) malloc(sizeof(pthread_t)); - - if (pthread_create((pthread_t*) fHelperParams[i].fThreadId, NULL, helperWrapper, &fHelperParams[i])) - { - HLTError("Error starting slave thread"); - cudaFree(fGPUMemory); - cudaFreeHost(fHostLockedMemory); - cudaThreadExit(); - } - } - } - fNSlaveThreads = nThreads; - return(0); -} - -template inline T* AliHLTTPCCAGPUTrackerNVCC::alignPointer(T* ptr, int alignment) -{ - //Macro to align Pointers. - //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker - //(As long as every single data structure is <= 1 MB) - - size_t adr = (size_t) ptr; - if (adr % alignment) - { - adr += alignment - (adr % alignment); - } - return((T*) adr); -} - -bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsgA(cudaError_t error, const char* file, int line) +bool AliHLTTPCCAGPUTrackerNVCC::GPUFailedMsgA(cudaError_t error, const char* file, int line) { //Check for CUDA Error and in the case of an error display the corresponding error string if (error == cudaSuccess) return(false); @@ -724,7 +233,7 @@ bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsgA(cudaError_t error, const char* fi return(true); } -int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state, int sliceLocal, int slice) +int AliHLTTPCCAGPUTrackerNVCC::GPUSync(char* state, int stream, int slice) { //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards @@ -733,81 +242,18 @@ int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state, int sliceLocal, int slice) cuErr = cudaGetLastError(); if (cuErr != cudaSuccess) { - HLTError("Cuda Error %s while running kernel (%s) (Slice %d; %d/%d)", cudaGetErrorString(cuErr), state, sliceLocal, slice, fgkNSlices); + HLTError("Cuda Error %s while running kernel (%s) (Stream %d; %d/%d)", cudaGetErrorString(cuErr), state, stream, slice, fgkNSlices); return(1); } - if (CudaFailedMsg(cudaThreadSynchronize())) + if (GPUFailedMsg(cudaThreadSynchronize())) { - HLTError("CUDA Error while synchronizing (%s) (Slice %d; %d/%d)", state, sliceLocal, slice, fgkNSlices); + HLTError("CUDA Error while synchronizing (%s) (Stream %d; %d/%d)", state, stream, slice, fgkNSlices); return(1); } if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done"); return(0); } -void AliHLTTPCCAGPUTrackerNVCC::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile) -{ - //Set Debug Level and Debug output File if applicable - fDebugLevel = dwLevel; - if (NewOutFile) fOutFile = NewOutFile; -} - -int AliHLTTPCCAGPUTrackerNVCC::SetGPUTrackerOption(char* OptionName, int OptionValue) -{ - //Set a specific GPU Tracker Option - if (strcmp(OptionName, "PPMode") == 0) - { - fPPMode = OptionValue; - } - else if (strcmp(OptionName, "DebugMask") == 0) - { - fDebugMask = OptionValue; - } - 
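The CudaFailedMsgA/CUDASync helpers are renamed to GPUFailedMsgA/GPUSync in this hunk so that the calling code reads the same in the CUDA and the new OpenCL backend. Every API call is passed through the checker, which reports the error string together with the call site. The wrapper macro itself is not visible in this hunk; it presumably forwards __FILE__ and __LINE__ along the following lines, shown here as a freestanding stand-in that prints instead of using the HLT logging macros.

#include <cstdio>
#include <cuda_runtime.h>

// Simplified, non-member stand-in for the checker shown in the diff above.
static bool GPUFailedMsgA(cudaError_t error, const char* file, int line)
{
	if (error == cudaSuccess) return(false);
	printf("CUDA Error %d: %s (%s:%d)\n", (int) error, cudaGetErrorString(error), file, line);
	return(true);
}

// Assumed form of the convenience macro: capture the call site for GPUFailedMsgA().
#define GPUFailedMsg(x) GPUFailedMsgA(x, __FILE__, __LINE__)

Call sites then read e.g. if (GPUFailedMsg(cudaMalloc(&ptr, size))) return(1);, as in the hunks above and below.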
else if (strcmp(OptionName, "HelperThreads") == 0) - { - fNHelperThreads = OptionValue; - } - else if (strcmp(OptionName, "CPUTrackers") == 0) - { - fNCPUTrackers = OptionValue; - } - else if (strcmp(OptionName, "SlicesPerCPUTracker") == 0) - { - fNSlicesPerCPUTracker = OptionValue; - } - else if (strcmp(OptionName, "GlobalTracking") == 0) - { - fGlobalTracking = OptionValue; - } - else - { - HLTError("Unknown Option: %s", OptionName); - return(1); - } - - if (fNHelperThreads + fNCPUTrackers > fNSlaveThreads && fCudaInitialized) - { - HLTInfo("Insufficient Slave Threads available (%d), creating additional Slave Threads (%d+%d)\n", fNSlaveThreads, fNHelperThreads, fNCPUTrackers); - StopHelperThreads(); - StartHelperThreads(); - } - - return(0); -} - -#ifdef HLTCA_STANDALONE -void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int iSlice, int i) -{ - //Run Performance Query for timer i of slice iSlice - if (fDebugLevel >= 1) - { - AliHLTTPCCATracker::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i)); - } -} -#else -void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int /*iSlice*/, int /*i*/) {} -#endif - #if defined(BITWISE_COMPATIBLE_DEBUG_OUTPUT) || defined(HLTCA_GPU_ALTERNATIVE_SCHEDULER) void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker*, int, bool) {} #else @@ -823,289 +269,104 @@ void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int i int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2); int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2); uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * fConstructorBlockCount); - CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost)); - CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost)); - CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * fConstructorBlockCount, cudaMemcpyDeviceToHost)); - CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * fConstructorBlockCount, cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost)); int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet; for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++) { - *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " << - rowBlockPos[tracker[iSlice].Param().NRows() / 
HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: "; - for (int j = 0;j < rowBlockPos[i].x;j++) - { - //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem! - *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", "; -#ifdef HLTCA_GPU_SCHED_FIXED_START - if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k) - { - HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k); - } -#endif //HLTCA_GPU_SCHED_FIXED_START - k++; - if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1) - { - HLTError("Error, -1 Tracklet found"); - } - } - *fOutFile << std::endl << "Phase 2: "; - for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++) - { - *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", "; - } - *fOutFile << std::endl; - } - - if (check) - { - *fOutFile << "Starting Threads: (Slice" << tracker[iSlice].Param().ISlice() << ", First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl; - for (int i = 0;i < fConstructorBlockCount;i++) - { - *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl; - } - } - - free(rowBlockPos); - free(rowBlockTracklets); - free(blockStartingTracklet); - } -} -#endif - -__global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits) -{ - //Initialize GPU RowBlocks and HitWeights - int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights; - const int stride = blockDim.x * gridDim.x; - int4 i0; - i0.x = i0.y = i0.z = i0.w = 0; -#ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER - int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets; - int4 i1; - i1.x = i1.y = i1.z = i1.w = -1; - for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride) - RowBlockPos[i] = i0; - for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride) - rowBlockTracklets4[i] = i1; -#endif - for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride) - sliceDataHitWeights4[i] = i0; -} - -int AliHLTTPCCAGPUTrackerNVCC::SelfHealReconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal) -{ - if (!fSelfheal) - { - cuCtxPopCurrent((CUcontext*) fCudaContext); - return(1); - } - static bool selfHealing = false; - if (selfHealing) - { - HLTError("Selfhealing failed, giving up"); - cuCtxPopCurrent((CUcontext*) fCudaContext); - return(1); - } - else - { - HLTError("Unsolvable CUDA error occured, trying to reinitialize GPU"); - } - selfHealing = true; - ExitGPU(); - if (InitGPU(fSliceCount, fCudaDevice)) - { - HLTError("Could not reinitialize CUDA device, disabling GPU tracker"); - ExitGPU(); - return(1); - } - 
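PreInitRowBlocks, shown above in its original CUDA form, clears the row-block scheduling arrays and the per-hit weights before each run by reinterpreting the int buffers as int4 and walking them in a grid-stride loop, which yields coalesced 16-byte stores for any launch configuration. Below is a minimal self-contained sketch of that pattern; the kernel and buffer names are illustrative, and the element count is assumed to be padded to a multiple of four, as the tracker arranges via NumberOfHitsPlusAlign().

#include <cuda_runtime.h>

// Grid-stride, int4-vectorized clear of an int buffer; count must be a multiple of 4.
__global__ void ClearIntBuffer(int* buf, int count)
{
	int4* const buf4 = (int4*) buf;
	const int stride = blockDim.x * gridDim.x;
	int4 zero;
	zero.x = zero.y = zero.z = zero.w = 0;
	for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count / 4; i += stride)
		buf4[i] = zero;
}

A launch such as ClearIntBuffer<<<blocks, threads>>>(hitWeights, nHitsPlusAlign); then covers the whole buffer with the same grid regardless of its size.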
HLTInfo("GPU tracker successfully reinitialized, restarting tracking"); - int retVal = Reconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal); - selfHealing = false; - return(retVal); -} - -void AliHLTTPCCAGPUTrackerNVCC::ReadEvent(AliHLTTPCCAClusterData* pClusterData, int firstSlice, int iSlice, int threadId) -{ - fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice)); -#ifdef HLTCA_GPU_TIME_PROFILE - unsigned long long int a, b; - AliHLTTPCCATracker::StandaloneQueryTime(&a); -#endif - fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]); -#ifdef HLTCA_GPU_TIME_PROFILE - AliHLTTPCCATracker::StandaloneQueryTime(&b); - printf("Read %d %f %f\n", threadId, ((double) b - (double) a) / (double) fProfTimeC, ((double) a - (double) fProfTimeD) / (double) fProfTimeC); -#endif -} - -void AliHLTTPCCAGPUTrackerNVCC::WriteOutput(AliHLTTPCCASliceOutput** pOutput, int firstSlice, int iSlice, int threadId) -{ - if (fDebugLevel >= 3) printf("GPU Tracker running WriteOutput for slice %d on thread %d\n", firstSlice + iSlice, threadId); - fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]); -#ifdef HLTCA_GPU_TIME_PROFILE - unsigned long long int a, b; - AliHLTTPCCATracker::StandaloneQueryTime(&a); -#endif - if (fNHelperThreads) pthread_mutex_lock((pthread_mutex_t*) fHelperMemMutex); - fSlaveTrackers[firstSlice + iSlice].WriteOutputPrepare(); - if (fNHelperThreads) pthread_mutex_unlock((pthread_mutex_t*) fHelperMemMutex); - fSlaveTrackers[firstSlice + iSlice].WriteOutput(); -#ifdef HLTCA_GPU_TIME_PROFILE - AliHLTTPCCATracker::StandaloneQueryTime(&b); - printf("Write %d %f %f\n", threadId, ((double) b - (double) a) / (double) fProfTimeC, ((double) a - (double) fProfTimeD) / (double) fProfTimeC); -#endif - if (fDebugLevel >= 3) printf("GPU Tracker finished WriteOutput for slice %d on thread %d\n", firstSlice + iSlice, threadId); -} - -int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal) -{ - //Primary reconstruction function - - cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams; - - if (sliceCountLocal == -1) sliceCountLocal = fSliceCount; - - if (!fCudaInitialized) - { - HLTError("GPUTracker not initialized"); - return(1); - } - if (sliceCountLocal > fSliceCount) - { - HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal); - return(1); - } - if (fThreadId != GetThread()) - { - HLTWarning("CUDA thread changed, migrating context, Previous Thread: %d, New Thread: %d", fThreadId, GetThread()); - fThreadId = GetThread(); - } - - if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal); - - if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM) - { - HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData)); - return(1); - } - - cuCtxPushCurrent(*((CUcontext*) fCudaContext)); - if (fPPMode) - { - int retVal = ReconstructPP(pOutput, pClusterData, firstSlice, sliceCountLocal); - cuCtxPopCurrent((CUcontext*) fCudaContext); 
- return(retVal); - } - - for (int i = fNHelperThreads;i < fNCPUTrackers + fNHelperThreads;i++) - { - fHelperParams[i].CPUTracker = 1; - fHelperParams[i].pClusterData = pClusterData; - fHelperParams[i].pOutput = pOutput; - fHelperParams[i].fSliceCount = sliceCountLocal; - fHelperParams[i].fFirstSlice = firstSlice; - pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]); - } - sliceCountLocal -= fNCPUTrackers * fNSlicesPerCPUTracker; - if (sliceCountLocal < 0) sliceCountLocal = 0; - - fUseGlobalTracking = fGlobalTracking && sliceCountLocal == fgkNSlices; - - memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal); - - if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants"); - -#ifdef HLTCA_GPU_TIME_PROFILE - AliHLTTPCCATracker::StandaloneQueryFreq(&fProfTimeC); - AliHLTTPCCATracker::StandaloneQueryTime(&fProfTimeD); -#endif - - for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) - { - //Make this a GPU Tracker - fGpuTracker[iSlice].SetGPUTracker(); - fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice)); - fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice)); - fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false); - - //Set Pointers to GPU Memory - char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice); - - if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory"); - tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters()); - tmpMem = alignPointer(tmpMem, 1024 * 1024); - - if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory"); - tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount); - tmpMem = alignPointer(tmpMem, 1024 * 1024); - - if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory"); - tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters()); - tmpMem = alignPointer(tmpMem, 1024 * 1024); - - if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY RANDOM_ERROR) - { - HLTError("Insufficiant Track Memory"); - cudaThreadSynchronize(); - cuCtxPopCurrent((CUcontext*) fCudaContext); - ResetHelperThreads(0); - return(1); - } - - if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY RANDOM_ERROR) - { - HLTError("Insufficiant Global Memory"); - cudaThreadSynchronize(); - cuCtxPopCurrent((CUcontext*) fCudaContext); - ResetHelperThreads(0); - return(1); + *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " << + rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: "; + for (int j = 0;j < rowBlockPos[i].x;j++) + { + //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem! 
+ *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", "; +#ifdef HLTCA_GPU_SCHED_FIXED_START + if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k) + { + HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k); + } +#endif //HLTCA_GPU_SCHED_FIXED_START + k++; + if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1) + { + HLTError("Error, -1 Tracklet found"); + } + } + *fOutFile << std::endl << "Phase 2: "; + for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++) + { + *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", "; + } + *fOutFile << std::endl; } - if (fDebugLevel >= 3) + if (check) { - HLTInfo("GPU Global Memory Used: %d/%d, Page Locked Tracks Memory used: %d / %d", (int) (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice)), HLTCA_GPU_GLOBAL_MEMORY, (int) fGpuTracker[iSlice].TrackMemorySize(), HLTCA_GPU_TRACKS_MEMORY); + *fOutFile << "Starting Threads: (Slice" << tracker[iSlice].Param().ISlice() << ", First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl; + for (int i = 0;i < fConstructorBlockCount;i++) + { + *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl; + } } - //Initialize Startup Constants - *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0; - *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0; - *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0; - fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = sliceCountLocal > fConstructorBlockCount ? 
(iSlice < fConstructorBlockCount) : fConstructorBlockCount * (iSlice + 1) / sliceCountLocal - fConstructorBlockCount * (iSlice) / sliceCountLocal; - if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount); - fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice; - fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal; - fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0; - fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fNextTracklet = (fConstructorBlockCount / sliceCountLocal + (fConstructorBlockCount % sliceCountLocal > iSlice)) * HLTCA_GPU_THREAD_COUNT; - fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory()); + free(rowBlockPos); + free(rowBlockTracklets); + free(blockStartingTracklet); } +} +#endif + +__global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits) +{ + //Initialize GPU RowBlocks and HitWeights + int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights; + const int stride = get_global_size(0); + int4 i0; + i0.x = i0.y = i0.z = i0.w = 0; +#ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER + int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets; + int4 i1; + i1.x = i1.y = i1.z = i1.w = -1; + for (int i = get_global_id(0);i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride) + RowBlockPos[i] = i0; + for (int i = get_global_id(0);i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride) + rowBlockTracklets4[i] = i1; +#endif + for (int i = get_global_id(0);i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride) + sliceDataHitWeights4[i] = i0; +} + +int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal) +{ + //Primary reconstruction function + + cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams; + + if (Reconstruct_Base_Init(pOutput, pClusterData, firstSlice, sliceCountLocal)) return(1); #ifdef HLTCA_GPU_TEXTURE_FETCH cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc(); size_t offset; - if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR) + if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR) { HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset); - cudaThreadSynchronize(); - cuCtxPopCurrent((CUcontext*) fCudaContext); ResetHelperThreads(0); return(1); } cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc(); - if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR) + if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR) { HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset); - cudaThreadSynchronize(); - cuCtxPopCurrent((CUcontext*) fCudaContext); ResetHelperThreads(0); return(1); } cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc(); - if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, 
fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR) + if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset RANDOM_ERROR) { HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset); - cudaThreadSynchronize(); - cuCtxPopCurrent((CUcontext*) fCudaContext); ResetHelperThreads(0); return(1); } @@ -1115,90 +376,39 @@ int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, Ali if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU"); #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE char* tmpMem; - if (CudaFailedMsg(cudaMalloc(&tmpMem, 100000000))) + if (GPUFailedMsg(cudaMalloc(&tmpMem, 100000000))) { HLTError("Error allocating CUDA profile memory"); - cudaThreadSynchronize(); - cuCtxPopCurrent((CUcontext*) fCudaContext); ResetHelperThreads(0); return(1); } fGpuTracker[0].fStageAtSync = tmpMem; - CudaFailedMsg(cudaMemset(fGpuTracker[0].StageAtSync(), 0, 100000000)); + GPUFailedMsg(cudaMemset(fGpuTracker[0].StageAtSync(), 0, 100000000)); #endif - CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0])); - if (CUDASync("Initialization (1)", 0, firstSlice) RANDOM_ERROR) + GPUFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0])); + if (GPUSync("Initialization (1)", 0, firstSlice) RANDOM_ERROR) { - cudaThreadSynchronize(); - cuCtxPopCurrent((CUcontext*) fCudaContext); ResetHelperThreads(0); return(1); } - for (int i = 0;i < fNHelperThreads;i++) - { - fHelperParams[i].CPUTracker = 0; - fHelperParams[i].fDone = 0; - fHelperParams[i].fPhase = 0; - fHelperParams[i].pClusterData = pClusterData; - fHelperParams[i].fSliceCount = sliceCountLocal; - fHelperParams[i].fFirstSlice = firstSlice; - pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]); - } - for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) { - StandalonePerfTime(firstSlice + iSlice, 0); - - //Initialize GPU Slave Tracker - if (fDebugLevel >= 3) HLTInfo("Creating Slice Data (Slice %d)", iSlice); - if (iSlice % (fNHelperThreads + 1) == 0) - { - ReadEvent(pClusterData, firstSlice, iSlice, 0); - } - else - { - if (fDebugLevel >= 3) HLTInfo("Waiting for helper thread %d", iSlice % (fNHelperThreads + 1) - 1); - while(fHelperParams[iSlice % (fNHelperThreads + 1) - 1].fDone < iSlice); - } - - if (fDebugLevel >= 4) - { -#ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT - *fOutFile << std::endl << std::endl << "Reconstruction: " << iSlice << "/" << sliceCountLocal << " Total Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << " / " << fgkNSlices << std::endl; -#endif - if (fDebugMask & 1) fSlaveTrackers[firstSlice + iSlice].DumpSliceData(*fOutFile); - } - - if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY RANDOM_ERROR) - { - HLTError("Insufficiant Slice Data Memory"); - cudaThreadSynchronize(); - cuCtxPopCurrent((CUcontext*) fCudaContext); - ResetHelperThreads(1); - return(1); - } - - if (fDebugLevel >= 3) - { - HLTInfo("GPU Slice Data Memory Used: %d/%d", (int) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize(), HLTCA_GPU_SLICE_DATA_MEMORY); - } + if (Reconstruct_Base_SliceInit(pClusterData, iSlice, firstSlice)) return(1); //Initialize temporary memory where needed 
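The rewritten PreInitRowBlocks above no longer uses the CUDA built-ins directly but indexes threads through get_global_id(0) and get_global_size(0); this is what allows the tracker kernels to be compiled both as CUDA and as OpenCL source for the new backend. On the CUDA side these names presumably resolve to the usual block/thread arithmetic, e.g. via macros of the following form; the actual definitions live in a common header that is not part of this hunk, so treat this as an assumption.

// Assumed CUDA-side mapping of the OpenCL work-item functions used in the shared
// kernel source (one-dimensional launches only, matching the former expressions).
#ifndef __OPENCL__
#define get_global_id(dim)   (blockIdx.x * blockDim.x + threadIdx.x)
#define get_global_size(dim) (blockDim.x * gridDim.x)
#endif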
if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory"); PreInitRowBlocks<<>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign()); - if (CUDASync("Initialization (2)", iSlice, iSlice + firstSlice) RANDOM_ERROR) + if (GPUSync("Initialization (2)", 2, iSlice + firstSlice) RANDOM_ERROR) { - cudaThreadSynchronize(); - cuCtxPopCurrent((CUcontext*) fCudaContext); ResetHelperThreads(1); return(1); } //Copy Data to GPU Global Memory - CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1])); - CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1])); - CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1])); + GPUFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1])); + GPUFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1])); + GPUFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1])); if (fDebugLevel >= 4) { @@ -1207,10 +417,8 @@ int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, Ali fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() ); } - if (CUDASync("Initialization (3)", iSlice, iSlice + firstSlice) RANDOM_ERROR) + if (GPUSync("Initialization (3)", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR) { - cudaThreadSynchronize(); - cuCtxPopCurrent((CUcontext*) fCudaContext); ResetHelperThreads(1); return(1); } @@ -1219,10 +427,8 @@ int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, Ali if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder (Slice %d/%d)", iSlice, sliceCountLocal); AliHLTTPCCAProcess <<>>(iSlice); - if (CUDASync("Neighbours finder", iSlice, iSlice + firstSlice) RANDOM_ERROR) + if (GPUSync("Neighbours finder", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR) { - cudaThreadSynchronize(); - cuCtxPopCurrent((CUcontext*) fCudaContext); ResetHelperThreads(1); return(1); } @@ -1231,16 +437,14 @@ int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, Ali if (fDebugLevel >= 4) { - CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + 
iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost)); if (fDebugMask & 2) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile); } if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner (Slice %d/%d)", iSlice, sliceCountLocal); AliHLTTPCCAProcess <<>>(iSlice); - if (CUDASync("Neighbours Cleaner", iSlice, iSlice + firstSlice) RANDOM_ERROR) + if (GPUSync("Neighbours Cleaner", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR) { - cudaThreadSynchronize(); - cuCtxPopCurrent((CUcontext*) fCudaContext); ResetHelperThreads(1); return(1); } @@ -1249,16 +453,14 @@ int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, Ali if (fDebugLevel >= 4) { - CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost)); if (fDebugMask & 4) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile); } if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder (Slice %d/%d)", iSlice, sliceCountLocal); AliHLTTPCCAProcess <<>>(iSlice); - if (CUDASync("Start Hits Finder", iSlice, iSlice + firstSlice) RANDOM_ERROR) + if (GPUSync("Start Hits Finder", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR) { - cudaThreadSynchronize(); - cuCtxPopCurrent((CUcontext*) fCudaContext); ResetHelperThreads(1); return(1); } @@ -1267,10 +469,8 @@ int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, Ali if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter (Slice %d/%d)", iSlice, sliceCountLocal); AliHLTTPCCAProcess <<>>(iSlice); - if (CUDASync("Start Hits Sorter", iSlice, iSlice + firstSlice) RANDOM_ERROR) + if (GPUSync("Start Hits Sorter", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR) { - cudaThreadSynchronize(); - cuCtxPopCurrent((CUcontext*) fCudaContext); ResetHelperThreads(1); return(1); } @@ -1279,13 +479,11 @@ int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, Ali if (fDebugLevel >= 2) { - CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost)); if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d (Slice %d)", *fSlaveTrackers[firstSlice + iSlice].NTracklets(), iSlice); if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS RANDOM_ERROR) { HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant"); - cudaThreadSynchronize(); - cuCtxPopCurrent((CUcontext*) fCudaContext); ResetHelperThreads(1); return(1); } @@ -1294,14 +492,14 @@ int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, Ali if (fDebugLevel >= 4 && *fSlaveTrackers[firstSlice + iSlice].NTracklets()) { #ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT - CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), 
pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost)); if (fDebugMask & 8) { *fOutFile << "Temporary "; fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile); } uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows()); - CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost)); if (fDebugMask & 16) { *fOutFile << "Start Hits Sort Vector:" << std::endl; @@ -1313,7 +511,7 @@ int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, Ali free(tmpMemory); #endif - CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost)); if (fDebugMask & 32) fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile); } @@ -1337,7 +535,7 @@ RestartTrackletConstructor: for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) { AliHLTTPCCATrackletConstructorInit<<>>(iSlice); - if (CUDASync("Tracklet Initializer", iSlice, iSlice + firstSlice) RANDOM_ERROR) + if (GPUSync("Tracklet Initializer", -1, iSlice + firstSlice) RANDOM_ERROR) { cudaThreadSynchronize(); cuCtxPopCurrent((CUcontext*) fCudaContext); @@ -1349,7 +547,7 @@ RestartTrackletConstructor: if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor"); AliHLTTPCCATrackletConstructorGPU<<>>(); - if (CUDASync("Tracklet Constructor", 0, firstSlice) RANDOM_ERROR) + if (GPUSync("Tracklet Constructor", -1, firstSlice) RANDOM_ERROR) { cudaThreadSynchronize(); cuCtxPopCurrent((CUcontext*) fCudaContext); @@ -1363,13 +561,13 @@ RestartTrackletConstructor: for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) { if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false); - CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost)); if (fDebugLevel >= 5) { HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets()); } - CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost)); - CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost)); if (0 && fSlaveTrackers[firstSlice + iSlice].NTracklets() && fSlaveTrackers[firstSlice + 
iSlice].Tracklet(0).NHits() < 0) { cudaThreadSynchronize(); @@ -1387,7 +585,7 @@ RestartTrackletConstructor: if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++; if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices); AliHLTTPCCAProcessMulti<<>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice)); - if (CUDASync("Tracklet Selector", iSlice, iSlice + firstSlice) RANDOM_ERROR) + if (GPUSync("Tracklet Selector", iSlice, iSlice + firstSlice) RANDOM_ERROR) { cudaThreadSynchronize(); cuCtxPopCurrent((CUcontext*) fCudaContext); @@ -1398,33 +596,8 @@ RestartTrackletConstructor: char *tmpMemoryGlobalTracking = NULL; fSliceOutputReady = 0; - if (fUseGlobalTracking) - { - int tmpmemSize = sizeof(AliHLTTPCCATracklet) -#ifdef EXTERN_ROW_HITS - + HLTCA_ROW_COUNT * sizeof(int) -#endif - + 16; - tmpMemoryGlobalTracking = (char*) malloc(tmpmemSize * fgkNSlices); - for (int i = 0;i < fgkNSlices;i++) - { - fSliceLeftGlobalReady[i] = 0; - fSliceRightGlobalReady[i] = 0; - } - memset(fGlobalTrackingDone, 0, fgkNSlices); - memset(fWriteOutputDone, 0, fgkNSlices); - - for (int iSlice = 0;iSlice < fgkNSlices;iSlice++) - { - fSlaveTrackers[iSlice].SetGPUTrackerTrackletsMemory(tmpMemoryGlobalTracking + (tmpmemSize * iSlice), 1, fConstructorBlockCount); - } - } - for (int i = 0;i < fNHelperThreads;i++) - { - fHelperParams[i].fPhase = 1; - fHelperParams[i].pOutput = pOutput; - pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0]); - } + + if (Reconstruct_Base_StartGlobal(pOutput, tmpMemoryGlobalTracking)) return(1); int tmpSlice = 0, tmpSlice2 = 0; for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) @@ -1433,10 +606,10 @@ RestartTrackletConstructor: while(tmpSlice < sliceCountLocal && (tmpSlice == iSlice || cudaStreamQuery(cudaStreams[tmpSlice]) == (cudaError_t) CUDA_SUCCESS)) { - if (CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice])) RANDOM_ERROR) + if (GPUFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice])) RANDOM_ERROR) { ResetHelperThreads(1); - cudaThreadSynchronize(); + ActivateThreadContext(); return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal)); } tmpSlice++; @@ -1444,21 +617,24 @@ RestartTrackletConstructor: while (tmpSlice2 < tmpSlice && (tmpSlice2 == iSlice ? 
cudaStreamSynchronize(cudaStreams[tmpSlice2]) : cudaStreamQuery(cudaStreams[tmpSlice2])) == (cudaError_t) CUDA_SUCCESS) { - CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].Tracks(), fGpuTracker[tmpSlice2].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + tmpSlice2].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2])); - CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].TrackHits(), fGpuTracker[tmpSlice2].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + tmpSlice2].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2])); + if (*fSlaveTrackers[firstSlice + tmpSlice2].NTracks() > 0) + { + GPUFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].Tracks(), fGpuTracker[tmpSlice2].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + tmpSlice2].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2])); + GPUFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].TrackHits(), fGpuTracker[tmpSlice2].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + tmpSlice2].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2])); + } tmpSlice2++; } - if (CudaFailedMsg(cudaStreamSynchronize(cudaStreams[iSlice])) RANDOM_ERROR) + if (GPUFailedMsg(cudaStreamSynchronize(cudaStreams[iSlice])) RANDOM_ERROR) { ResetHelperThreads(1); - cudaThreadSynchronize(); + ActivateThreadContext(); return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal)); } if (fDebugLevel >= 4) { - CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost)); #ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT if (fDebugMask & 256) fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile); #endif @@ -1481,81 +657,28 @@ RestartTrackletConstructor: if (fDebugLevel >= 4) { ResetHelperThreads(1); - cudaThreadSynchronize(); - cuCtxPopCurrent((CUcontext*) fCudaContext); return(1); } for (int i = 0;i < sliceCountLocal;i++) { cudaThreadSynchronize(); - CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost)); *fSlaveTrackers[firstSlice + i].NTracks() = 0; *fSlaveTrackers[firstSlice + i].NTrackHits() = 0; fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE; - CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice)); + GPUFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice)); PreInitRowBlocks<<>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign()); } goto RestartTrackletConstructor; } #endif HLTError("GPU Tracker returned Error Code %d in slice %d", fSlaveTrackers[firstSlice + 
iSlice].GPUParameters()->fGPUError, firstSlice + iSlice); - cudaThreadSynchronize(); - cuCtxPopCurrent((CUcontext*) fCudaContext); ResetHelperThreads(1); return(1); } if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits()); - fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTracks = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracks; - fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNLocalTrackHits = fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTrackHits; - if (fUseGlobalTracking) fSlaveTrackers[firstSlice + iSlice].CommonMemory()->fNTracklets = 1; - - if (fDebugLevel >= 3) HLTInfo("Data ready for slice %d, helper thread %d", iSlice, iSlice % (fNHelperThreads + 1)); - fSliceOutputReady = iSlice; - - if (fUseGlobalTracking) - { - if (iSlice % (fgkNSlices / 2) == 2) - { - int tmpId = iSlice % (fgkNSlices / 2) - 1; - if (iSlice >= fgkNSlices / 2) tmpId += fgkNSlices / 2; - GlobalTracking(tmpId, 0, NULL); - fGlobalTrackingDone[tmpId] = 1; - } - for (int tmpSlice3a = 0;tmpSlice3a < iSlice;tmpSlice3a += fNHelperThreads + 1) - { - int tmpSlice3 = tmpSlice3a + 1; - if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2); - if (tmpSlice3 >= iSlice) break; - - int sliceLeft = (tmpSlice3 + (fgkNSlices / 2 - 1)) % (fgkNSlices / 2); - int sliceRight = (tmpSlice3 + 1) % (fgkNSlices / 2); - if (tmpSlice3 >= fgkNSlices / 2) - { - sliceLeft += fgkNSlices / 2; - sliceRight += fgkNSlices / 2; - } - - if (tmpSlice3 % (fgkNSlices / 2) != 1 && fGlobalTrackingDone[tmpSlice3] == 0 && sliceLeft < iSlice && sliceRight < iSlice) - { - GlobalTracking(tmpSlice3, 0, NULL); - fGlobalTrackingDone[tmpSlice3] = 1; - } - - if (fWriteOutputDone[tmpSlice3] == 0 && fSliceLeftGlobalReady[tmpSlice3] && fSliceRightGlobalReady[tmpSlice3]) - { - WriteOutput(pOutput, firstSlice, tmpSlice3, 0); - fWriteOutputDone[tmpSlice3] = 1; - } - } - } - else - { - if (iSlice % (fNHelperThreads + 1) == 0) - { - WriteOutput(pOutput, firstSlice, iSlice, 0); - } - } + if (Reconstruct_Base_FinishSlices(pOutput, iSlice, firstSlice)) return(1); if (fDebugLevel >= 4) { @@ -1564,46 +687,7 @@ RestartTrackletConstructor: } } - if (fUseGlobalTracking) - { - for (int tmpSlice3a = 0;tmpSlice3a < fgkNSlices;tmpSlice3a += fNHelperThreads + 1) - { - int tmpSlice3 = (tmpSlice3a + 1); - if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2); - if (fGlobalTrackingDone[tmpSlice3] == 0) GlobalTracking(tmpSlice3, 0, NULL); - } - for (int tmpSlice3a = 0;tmpSlice3a < fgkNSlices;tmpSlice3a += fNHelperThreads + 1) - { - int tmpSlice3 = (tmpSlice3a + 1); - if (tmpSlice3 % (fgkNSlices / 2) < 1) tmpSlice3 -= (fgkNSlices / 2); - if (fWriteOutputDone[tmpSlice3] == 0) - { - while (fSliceLeftGlobalReady[tmpSlice3] == 0 || fSliceRightGlobalReady[tmpSlice3] == 0); - WriteOutput(pOutput, firstSlice, tmpSlice3, 0); - } - } - } - - for (int i = 0;i < fNHelperThreads + fNCPUTrackers;i++) - { - pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]); - } - - if (fUseGlobalTracking) - { - free(tmpMemoryGlobalTracking); - if (fDebugLevel >= 3) - { - for (int iSlice = 0;iSlice < fgkNSlices;iSlice++) - { - printf("Slice %d - Tracks: Local %d Global %d - Hits: Local %d Global %d\n", iSlice, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTracks, fSlaveTrackers[iSlice].CommonMemory()->fNTracks, fSlaveTrackers[iSlice].CommonMemory()->fNLocalTrackHits, fSlaveTrackers[iSlice].CommonMemory()->fNTrackHits); - } - } - } - - 
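From here on, the per-slice bookkeeping removed below (local track counters, global-tracking ordering, output writing, helper-thread joins) lives in AliHLTTPCCAGPUTrackerBase, and the CUDA backend merely calls the Reconstruct_Base_* steps between its own kernel launches and transfers. Schematically, a backend's Reconstruct() is now structured as in the outline below; the step names and argument lists are taken from this patch, the comments stand for backend-specific work, and the surrounding class name is hypothetical, so this is an outline rather than literal code.

int SomeGPUBackend::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
{
	if (Reconstruct_Base_Init(pOutput, pClusterData, firstSlice, sliceCountLocal)) return(1);
	for (int iSlice = 0; iSlice < sliceCountLocal; iSlice++)
	{
		if (Reconstruct_Base_SliceInit(pClusterData, iSlice, firstSlice)) return(1);
		//backend-specific: copy slice data, run neighbours finder/cleaner and start-hit kernels
	}
	//backend-specific: tracklet constructor and tracklet selector kernels
	char* tmpMemoryGlobalTracking = NULL;
	fSliceOutputReady = 0;
	if (Reconstruct_Base_StartGlobal(pOutput, tmpMemoryGlobalTracking)) return(1);
	for (int iSlice = 0; iSlice < sliceCountLocal; iSlice++)
	{
		//backend-specific: copy tracks and track hits of this slice back to the host
		if (Reconstruct_Base_FinishSlices(pOutput, iSlice, firstSlice)) return(1);
	}
	if (Reconstruct_Base_Finalize(pOutput, tmpMemoryGlobalTracking, firstSlice)) return(1);
	return(0);
}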
StandalonePerfTime(firstSlice, 10); - - if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished"); + if (Reconstruct_Base_Finalize(pOutput, tmpMemoryGlobalTracking, firstSlice)) return(1); /*for (int i = firstSlice;i < firstSlice + sliceCountLocal;i++) { @@ -1623,7 +707,7 @@ RestartTrackletConstructor: #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE char* stageAtSync = (char*) malloc(100000000); - CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].StageAtSync(), 100 * 1000 * 1000, cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].StageAtSync(), 100 * 1000 * 1000, cudaMemcpyDeviceToHost)); cudaFree(fGpuTracker[0].StageAtSync()); FILE* fp = fopen("profile.txt", "w+"); @@ -1692,11 +776,11 @@ __global__ void ClearPPHitWeights(int sliceCount) AliHLTTPCCATracker &tracker = ((AliHLTTPCCATracker*) gAliHLTTPCCATracker)[k]; int4* const pHitWeights = (int4*) tracker.Data().HitWeights(); const int dwCount = tracker.Data().NumberOfHitsPlusAlign(); - const int stride = blockDim.x * gridDim.x; + const int stride = get_global_size(0); int4 i0; i0.x = i0.y = i0.z = i0.w = 0; - for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < dwCount * sizeof(int) / sizeof(int4);i += stride) + for (int i = get_global_id(0);i < dwCount * sizeof(int) / sizeof(int4);i += stride) { pHitWeights[i] = i0; } @@ -1732,7 +816,6 @@ int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, A fGpuTracker[iSlice].SetGPUTracker(); fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice)); - fGpuTracker[iSlice].SetGPUSliceDataMemory(tmpSliceMemGpu, RowMemory(fGPUMemory, iSlice)); fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false); @@ -1741,7 +824,6 @@ int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, A tmpSliceMemGpu += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize(); tmpSliceMemGpu = alignPointer(tmpSliceMemGpu, 64 * 1024); - //Set Pointers to GPU Memory char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice); @@ -1777,26 +859,26 @@ int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, A fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory()); - if (CUDASync("Initialization", iSlice, iSlice + firstSlice)) return(1); + if (GPUSync("Initialization", -1, iSlice + firstSlice)) return(1); StandalonePerfTime(firstSlice + iSlice, 1); } #ifdef HLTCA_GPU_TEXTURE_FETCH cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc(); size_t offset; - if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset) + if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset) { HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset); return(1); } cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc(); - if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset) + if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset) { HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset); return(1); } cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc(); - if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, 
fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset) + if (GPUFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset) { HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset); return(1); @@ -1805,34 +887,34 @@ int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, A //Copy Tracker Object to GPU Memory if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU"); - CudaFailedMsg(cudaMemcpyToSymbol(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice)); + GPUFailedMsg(cudaMemcpyToSymbol(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice)); //Copy Data to GPU Global Memory for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) { - CudaFailedMsg(cudaMemcpy(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice)); + GPUFailedMsg(cudaMemcpy(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice)); //printf("%lld %lld %d %d\n", (size_t) (char*) fGpuTracker[iSlice].Data().Memory(), (size_t) (char*) fSlaveTrackers[firstSlice + iSlice].Data().Memory(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize()); } - //CudaFailedMsg(cudaMemcpy(SliceDataMemory(fGPUMemory, 0), SliceDataMemory(fHostLockedMemory, 0), tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0), cudaMemcpyHostToDevice)); + //GPUFailedMsg(cudaMemcpy(SliceDataMemory(fGPUMemory, 0), SliceDataMemory(fHostLockedMemory, 0), tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0), cudaMemcpyHostToDevice)); //printf("%lld %lld %d\n", (size_t) (char*) SliceDataMemory(fGPUMemory, 0), (size_t) (char*) SliceDataMemory(fHostLockedMemory, 0), (int) (size_t) (tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0))); - CudaFailedMsg(cudaMemcpy(fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyHostToDevice)); - CudaFailedMsg(cudaMemcpy(fGpuTracker[0].SliceDataRows(), fSlaveTrackers[firstSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) * sliceCountLocal, cudaMemcpyHostToDevice)); + GPUFailedMsg(cudaMemcpy(fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyHostToDevice)); + GPUFailedMsg(cudaMemcpy(fGpuTracker[0].SliceDataRows(), fSlaveTrackers[firstSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) * sliceCountLocal, cudaMemcpyHostToDevice)); if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder"); AliHLTTPCCAProcessMultiA <<>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows()); - if (CUDASync("Neighbours finder", 0, firstSlice)) return 1; + if (GPUSync("Neighbours finder", -1, firstSlice)) return 1; StandalonePerfTime(firstSlice, 2); if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner"); AliHLTTPCCAProcessMultiA <<>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 2); - if (CUDASync("Neighbours Cleaner", 0, firstSlice)) 
return 1; + if (GPUSync("Neighbours Cleaner", -1, firstSlice)) return 1; StandalonePerfTime(firstSlice, 3); if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder"); AliHLTTPCCAProcessMultiA <<>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 6); - if (CUDASync("Start Hits Finder", 0, firstSlice)) return 1; + if (GPUSync("Start Hits Finder", -1, firstSlice)) return 1; StandalonePerfTime(firstSlice, 4); ClearPPHitWeights <<>>(sliceCountLocal); - if (CUDASync("Clear Hit Weights", 0, firstSlice)) return 1; + if (GPUSync("Clear Hit Weights", -1, firstSlice)) return 1; for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) { @@ -1843,22 +925,22 @@ int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, A if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor"); AliHLTTPCCATrackletConstructorGPUPP<<>>(0, sliceCountLocal); - if (CUDASync("Tracklet Constructor PP", 0, firstSlice)) return 1; + if (GPUSync("Tracklet Constructor PP", -1, firstSlice)) return 1; StandalonePerfTime(firstSlice, 8); AliHLTTPCCAProcessMulti<<>>(0, sliceCountLocal); - if (CUDASync("Tracklet Selector", 0, firstSlice)) return 1; + if (GPUSync("Tracklet Selector", -1, firstSlice)) return 1; StandalonePerfTime(firstSlice, 9); - CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice].CommonMemory(), fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice].CommonMemory(), fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyDeviceToHost)); for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) { if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host"); - CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost)); - CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost)); if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError) { @@ -1920,19 +1002,7 @@ int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, A return(0); } -int AliHLTTPCCAGPUTrackerNVCC::InitializeSliceParam(int iSlice, AliHLTTPCCAParam ¶m) -{ - //Initialize Slice Tracker Parameter for a slave tracker - fSlaveTrackers[iSlice].Initialize(param); - if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT) - { - HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT); - return(1); - } - return(0); -} - -int AliHLTTPCCAGPUTrackerNVCC::ExitGPU() +int AliHLTTPCCAGPUTrackerNVCC::ExitGPU_Runtime() { //Uninitialize CUDA cuCtxPushCurrent(*((CUcontext*) fCudaContext)); @@ -1954,19 +1024,12 @@ int AliHLTTPCCAGPUTrackerNVCC::ExitGPU() cudaFreeHost(fHostLockedMemory); } - if (CudaFailedMsg(cudaThreadExit())) + if 
(GPUFailedMsg(cudaThreadExit())) { HLTError("Could not uninitialize GPU"); return(1); } - if (StopHelperThreads()) return(1); - pthread_mutex_destroy((pthread_mutex_t*) fHelperMemMutex); - free(fHelperMemMutex); - - for (int i = 0;i < fgkNSlices;i++) pthread_mutex_destroy(&((pthread_mutex_t*) fSliceGlobalMutexes)[i]); - free(fSliceGlobalMutexes); - cuCtxDestroy(*((CUcontext*) fCudaContext)); cudaDeviceReset(); @@ -1976,100 +1039,6 @@ int AliHLTTPCCAGPUTrackerNVCC::ExitGPU() return(0); } -void AliHLTTPCCAGPUTrackerNVCC::ResetHelperThreads(int helpers) -{ - HLTImportant("Error occurred, GPU tracker helper threads will be reset (Number of threads %d/%d)", fNHelperThreads, fNCPUTrackers); - for (int i = 0;i < fNHelperThreads + fNCPUTrackers;i++) - { - fHelperParams[i].fReset = true; - if (helpers || i >= fNHelperThreads) pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]); - } - HLTImportant("GPU Tracker helper threads have ben reset"); -} - -int AliHLTTPCCAGPUTrackerNVCC::StopHelperThreads() -{ - if (fNSlaveThreads) - { - for (int i = 0;i < fNSlaveThreads;i++) - { - fHelperParams[i].fTerminate = true; - if (pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[0])) - { - HLTError("Error unlocking mutex to terminate slave"); - return(1); - } - if (pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1])) - { - HLTError("Error locking mutex"); - return(1); - } - if (pthread_join( *((pthread_t*) fHelperParams[i].fThreadId), NULL)) - { - HLTError("Error waiting for thread to terminate"); - return(1); - } - free(fHelperParams[i].fThreadId); - for (int j = 0;j < 2;j++) - { - if (pthread_mutex_unlock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j])) - { - HLTError("Error unlocking mutex before destroying"); - return(1); - } - pthread_mutex_destroy(&((pthread_mutex_t*) fHelperParams[i].fMutex)[j]); - } - free(fHelperParams[i].fMutex); - } - delete[] fHelperParams; - } - fNSlaveThreads = 0; - return(0); -} - -void AliHLTTPCCAGPUTrackerNVCC::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val) -{ - //Set Output Control Pointers - fOutputControl = val; - for (int i = 0;i < fgkNSlices;i++) - { - fSlaveTrackers[i].SetOutputControl(val); - } -} - -int AliHLTTPCCAGPUTrackerNVCC::GetThread() -{ - //Get Thread ID -#ifdef R__WIN32 - return((int) (size_t) GetCurrentThread()); -#else - return((int) syscall (SYS_gettid)); -#endif -} - -unsigned long long int* AliHLTTPCCAGPUTrackerNVCC::PerfTimer(int iSlice, unsigned int i) -{ - //Returns pointer to PerfTimer i of slice iSlice - return(fSlaveTrackers ? 
fSlaveTrackers[iSlice].PerfTimer(i) : NULL); -} - -const AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerNVCC::OutputControl() const -{ - //Return Pointer to Output Control Structure - return fOutputControl; -} - -int AliHLTTPCCAGPUTrackerNVCC::GetSliceCount() const -{ - //Return max slice count processable - return(fSliceCount); -} - -char* AliHLTTPCCAGPUTrackerNVCC::MergerBaseMemory() -{ - return(alignPointer((char*) fGPUMergerHostMemory, 1024 * 1024)); -} - int AliHLTTPCCAGPUTrackerNVCC::RefitMergedTracks(AliHLTTPCGMMerger* Merger) { #ifndef HLTCA_GPU_MERGER @@ -2102,7 +1071,6 @@ int AliHLTTPCCAGPUTrackerNVCC::RefitMergedTracks(AliHLTTPCGMMerger* Merger) AssignMemory(field, gpumem, 6); AssignMemory(param, gpumem, 1); - if ((size_t) (gpumem - (char*) fGPUMergerMemory) > (size_t) fGPUMergerMaxMemory) { HLTError("Insufficiant GPU Merger Memory"); @@ -2112,25 +1080,25 @@ int AliHLTTPCCAGPUTrackerNVCC::RefitMergedTracks(AliHLTTPCGMMerger* Merger) if (fDebugLevel >= 2) HLTInfo("Running GPU Merger (%d/%d)", Merger->NOutputTrackClusters(), Merger->NClusters()); AliHLTTPCCATracker::StandaloneQueryTime(&a); - CudaFailedMsg(cudaMemcpy(X, Merger->ClusterX(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice)); - CudaFailedMsg(cudaMemcpy(Y, Merger->ClusterY(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice)); - CudaFailedMsg(cudaMemcpy(Z, Merger->ClusterZ(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice)); - CudaFailedMsg(cudaMemcpy(Angle, Merger->ClusterAngle(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice)); - CudaFailedMsg(cudaMemcpy(RowType, Merger->ClusterRowType(), Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyHostToDevice)); - CudaFailedMsg(cudaMemcpy(tracks, Merger->OutputTracks(), Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyHostToDevice)); - CudaFailedMsg(cudaMemcpy(field, Merger->PolinomialFieldBz(), 6 * sizeof(float), cudaMemcpyHostToDevice)); - CudaFailedMsg(cudaMemcpy(param, fSlaveTrackers[0].pParam(), sizeof(AliHLTTPCCAParam), cudaMemcpyHostToDevice)); + GPUFailedMsg(cudaMemcpy(X, Merger->ClusterX(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice)); + GPUFailedMsg(cudaMemcpy(Y, Merger->ClusterY(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice)); + GPUFailedMsg(cudaMemcpy(Z, Merger->ClusterZ(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice)); + GPUFailedMsg(cudaMemcpy(Angle, Merger->ClusterAngle(), Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyHostToDevice)); + GPUFailedMsg(cudaMemcpy(RowType, Merger->ClusterRowType(), Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyHostToDevice)); + GPUFailedMsg(cudaMemcpy(tracks, Merger->OutputTracks(), Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyHostToDevice)); + GPUFailedMsg(cudaMemcpy(field, Merger->PolinomialFieldBz(), 6 * sizeof(float), cudaMemcpyHostToDevice)); + GPUFailedMsg(cudaMemcpy(param, fSlaveTrackers[0].pParam(), sizeof(AliHLTTPCCAParam), cudaMemcpyHostToDevice)); AliHLTTPCCATracker::StandaloneQueryTime(&b); RefitTracks<<>>(tracks, Merger->NOutputTracks(), field, X, Y, Z, RowType, Angle, param); - CudaFailedMsg(cudaThreadSynchronize()); + GPUFailedMsg(cudaThreadSynchronize()); AliHLTTPCCATracker::StandaloneQueryTime(&c); - CudaFailedMsg(cudaMemcpy(Merger->ClusterX(), X, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost)); - 
CudaFailedMsg(cudaMemcpy(Merger->ClusterY(), Y, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost)); - CudaFailedMsg(cudaMemcpy(Merger->ClusterZ(), Z, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost)); - CudaFailedMsg(cudaMemcpy(Merger->ClusterAngle(), Angle, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost)); - CudaFailedMsg(cudaMemcpy(Merger->ClusterRowType(), RowType, Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyDeviceToHost)); - CudaFailedMsg(cudaMemcpy((void*) Merger->OutputTracks(), tracks, Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyDeviceToHost)); - CudaFailedMsg(cudaThreadSynchronize()); + GPUFailedMsg(cudaMemcpy(Merger->ClusterX(), X, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(Merger->ClusterY(), Y, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(Merger->ClusterZ(), Z, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(Merger->ClusterAngle(), Angle, Merger->NOutputTrackClusters() * sizeof(float), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy(Merger->ClusterRowType(), RowType, Merger->NOutputTrackClusters() * sizeof(unsigned int), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaMemcpy((void*) Merger->OutputTracks(), tracks, Merger->NOutputTracks() * sizeof(AliHLTTPCGMMergedTrack), cudaMemcpyDeviceToHost)); + GPUFailedMsg(cudaThreadSynchronize()); AliHLTTPCCATracker::StandaloneQueryTime(&d); if (fDebugLevel >= 2) HLTInfo("GPU Merger Finished"); @@ -2149,9 +1117,23 @@ int AliHLTTPCCAGPUTrackerNVCC::RefitMergedTracks(AliHLTTPCGMMerger* Merger) #endif } -int AliHLTTPCCAGPUTrackerNVCC::IsInitialized() +int AliHLTTPCCAGPUTrackerNVCC::GPUMergerAvailable() +{ + return(1); +} + +void AliHLTTPCCAGPUTrackerNVCC::ActivateThreadContext() { - return(fCudaInitialized); + cuCtxPushCurrent(*((CUcontext*) fCudaContext)); +} +void AliHLTTPCCAGPUTrackerNVCC::ReleaseThreadContext() +{ + cuCtxPopCurrent((CUcontext*) fCudaContext); +} + +void AliHLTTPCCAGPUTrackerNVCC::SynchronizeGPU() +{ + cudaThreadSynchronize(); } AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate() diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu.x86_64-pc-linux-gnu.patch b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu.x86_64-pc-linux-gnu.patch deleted file mode 100755 index 3202e11a7f0..00000000000 --- a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.cu.x86_64-pc-linux-gnu.patch +++ /dev/null @@ -1,122 +0,0 @@ ---- AliHLTTPCCAGPUTracker.cucpp 2009-05-28 12:14:09.000000000 +0200 -+++ release/x86_64-pc-linux-gnu/code/AliHLTTPCCAGPUTracker.cucpp 2009-05-28 12:10:25.000000000 +0200 -@@ -1530,10 +1530,10 @@ - extern "C" { extern int getdate_err; } - extern "C" tm *getdate(const char *); - extern "C" int getdate_r(const char *__restrict__, tm *__restrict__); --extern "C" { extern inline __attribute__((__weak__)) void *memcpy(void *__restrict__, const void *__restrict__, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))); } -+extern "C" { extern inline void *memcpy(void *__restrict__, const void *__restrict__, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))); } - extern "C" { extern inline void *memmove(void *, const void *, size_t) throw() 
__attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))); } - extern "C" void *memccpy(void *__restrict__, const void *__restrict__, int, size_t) throw() __attribute__((nonnull(1))) __attribute__((nonnull(2))); --extern "C" { extern inline __attribute__((__weak__)) void *memset(void *, int, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))); } -+extern "C" { extern inline void *memset(void *, int, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))); } - extern "C" int memcmp(const void *, const void *, size_t) throw() __attribute__((__pure__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))); - extern inline void *memchr(void *, int, size_t) throw() __asm__("memchr") __attribute__((__pure__)) __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))); - extern inline const void *memchr(const void *, int, size_t) throw() __asm__("memchr") __attribute__((__pure__)) __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))); -@@ -1661,7 +1661,7 @@ - char *basename(char *) throw() __asm__("basename") __attribute__((nonnull(1))); - const char *basename(const char *) throw() __asm__("basename") __attribute__((nonnull(1))); - extern "C" void __warn_memset_zero_len(); --extern "C" { inline __attribute__((__weak__)) __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))) void *memcpy(void *__restrict__ __dest, const void *__restrict__ __src, size_t __len) throw() -+extern "C" { inline __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))) void *memcpy(void *__restrict__ __dest, const void *__restrict__ __src, size_t __len) throw() - { - return __builtin___memcpy_chk(__dest, __src, __len, __builtin_object_size(__dest, 0)); - } } -@@ -1673,7 +1673,7 @@ - { - return __builtin___mempcpy_chk(__dest, __src, __len, __builtin_object_size(__dest, 0)); - } } --extern "C" { inline __attribute__((__weak__)) __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) void *memset(void *__dest, int __ch, size_t __len) throw() -+extern "C" { inline __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) void *memset(void *__dest, int __ch, size_t __len) throw() - { - if (((0) && (__len == (0))) && ((!(0)) || (__ch != 0))) - { -@@ -1719,8 +1719,6 @@ - return __builtin___strncat_chk(__dest, __src, __len, __builtin_object_size(__dest, 2 > 1)); - } } - extern "C" __attribute__((__weak__)) clock_t clock() throw(); --extern "C" { extern inline __attribute__((__weak__)) void *memset(void *, int, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))); } --extern "C" { extern inline __attribute__((__weak__)) void *memcpy(void *, const void *, size_t) throw() __attribute__((__gnu_inline__)) __attribute__((__always_inline__)) __attribute__((nonnull(1))) __attribute__((nonnull(2))); } - extern "C" __attribute__((__weak__)) int abs(int) throw() __attribute__((__warn_unused_result__)) __attribute__((__const__)); - extern "C" __attribute__((__weak__)) long labs(long) throw() __attribute__((__warn_unused_result__)) __attribute__((__const__)); - extern "C" __attribute__((__weak__)) long long llabs(long long) 
throw() __attribute__((__warn_unused_result__)) __attribute__((__const__)); -@@ -1862,11 +1860,8 @@ - extern "C" __attribute__((__weak__)) int __isnanf(float) throw() __attribute__((__const__)); - extern "C" __attribute__((__weak__)) int __finite(double) throw() __attribute__((__const__)); - extern "C" __attribute__((__weak__)) int __finitef(float) throw() __attribute__((__const__)); --extern "C" { extern inline __attribute__((__weak__)) int __signbit(double) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); } --extern "C" { extern inline __attribute__((__weak__)) int __signbitf(float) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); } - extern "C" __attribute__((__weak__)) double fma(double, double, double) throw(); - extern "C" __attribute__((__weak__)) float fmaf(float, float, float) throw(); --extern "C" { extern inline __attribute__((__weak__)) int __signbitl(long double) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); } - extern "C" __attribute__((__weak__)) int __isinfl(long double) throw() __attribute__((__const__)); - extern "C" __attribute__((__weak__)) int __isnanl(long double) throw() __attribute__((__const__)); - extern "C" __attribute__((__weak__)) int __finitel(long double) throw() __attribute__((__const__)); -@@ -1948,7 +1943,7 @@ - extern "C" __attribute__((__weak__)) double fmax(double, double) throw(); extern "C" double __fmax(double, double) throw(); - extern "C" __attribute__((__weak__)) double fmin(double, double) throw(); extern "C" double __fmin(double, double) throw(); - extern "C" int __fpclassify(double) throw() __attribute__((__const__)); --extern "C" { extern inline __attribute__((__weak__)) int __signbit(double) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); } -+extern "C" { extern inline int __signbit(double) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); } - extern "C" __attribute__((__weak__)) double fma(double, double, double) throw(); extern "C" double __fma(double, double, double) throw(); - extern "C" double scalb(double, double) throw(); extern "C" double __scalb(double, double) throw(); - extern "C" __attribute__((__weak__)) float acosf(float) throw(); extern "C" float __acosf(float) throw(); -@@ -2027,7 +2022,7 @@ - extern "C" __attribute__((__weak__)) float fmaxf(float, float) throw(); extern "C" float __fmaxf(float, float) throw(); - extern "C" __attribute__((__weak__)) float fminf(float, float) throw(); extern "C" float __fminf(float, float) throw(); - extern "C" int __fpclassifyf(float) throw() __attribute__((__const__)); --extern "C" { extern inline __attribute__((__weak__)) int __signbitf(float) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); } -+extern "C" { extern inline int __signbitf(float) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); } - extern "C" __attribute__((__weak__)) float fmaf(float, float, float) throw(); extern "C" float __fmaf(float, float, float) throw(); - extern "C" float scalbf(float, float) throw(); extern "C" float __scalbf(float, float) throw(); - extern "C" long double acosl(long double) throw(); extern "C" long double __acosl(long double) throw(); -@@ -2106,7 +2101,7 @@ - extern "C" long double fmaxl(long double, long double) throw(); extern "C" long double __fmaxl(long double, long double) throw(); - extern "C" long double fminl(long double, long double) throw(); extern "C" long double __fminl(long double, long double) throw(); - extern "C" int __fpclassifyl(long double) throw() 
__attribute__((__const__)); --extern "C" { extern inline __attribute__((__weak__)) int __signbitl(long double) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); } -+extern "C" { extern inline int __signbitl(long double) throw() __attribute__((__gnu_inline__)) __attribute__((__const__)); } - extern "C" long double fmal(long double, long double, long double) throw(); extern "C" long double __fmal(long double, long double, long double) throw(); - extern "C" long double scalbl(long double, long double) throw(); extern "C" long double __scalbl(long double, long double) throw(); - extern "C" { extern int signgam; } -@@ -2134,19 +2129,19 @@ - double retval; - }; } - extern "C" int matherr(__exception *) throw(); --extern "C" { inline __attribute__((__weak__)) __attribute__((__gnu_inline__)) __attribute__((__const__)) int __signbitf(float __x) throw() -+extern "C" { inline __attribute__((__gnu_inline__)) __attribute__((__const__)) int __signbitf(float __x) throw() - { - int __m; - __asm__("pmovmskb %1, %0" : "=r" (__m) : "x" (__x)); - return __m & 8; - } } --extern "C" { inline __attribute__((__weak__)) __attribute__((__gnu_inline__)) __attribute__((__const__)) int __signbit(double __x) throw() -+extern "C" { inline __attribute__((__gnu_inline__)) __attribute__((__const__)) int __signbit(double __x) throw() - { - int __m; - __asm__("pmovmskb %1, %0" : "=r" (__m) : "x" (__x)); - return __m & 128; - } } --extern "C" { inline __attribute__((__weak__)) __attribute__((__gnu_inline__)) __attribute__((__const__)) int __signbitl(long double __x) throw() -+extern "C" { inline __attribute__((__gnu_inline__)) __attribute__((__const__)) int __signbitl(long double __x) throw() - { - union { long double __l; int __i[3]; } __u = {__l: __x}; - return (((__u.__i)[2]) & 32768) != 0; -@@ -9864,7 +9859,7 @@ - { - __c_locale __old = __gnu_cxx::__uselocale(__cloc); - __builtin_va_list __args; --__builtin_stdarg_start(__args,__fmt); -+__builtin_va_start(__args,__fmt); - const int __ret = __builtin_vsnprintf(__out, __size, __fmt, __args); - __builtin_va_end(__args); - __gnu_cxx::__uselocale(__old); -@@ -23186,7 +23186,7 @@ - static T2 *Alloc(int s) { auto T2 *p = (reinterpret_cast< T2 *>(_mm_malloc(s * sizeof(CacheLineSizeHelper< T> ), 128))); return new (p) T2 [s]; } - static void Free(T2 *const p, int size) { - for (int i = 0; i < size; ++i) { --((p[i]).~CacheLineSizeHelper()); -+((p[i]).~T2()); - } - _mm_free(p); - } diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.h b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.h index 11e7d842d00..073aa5c70dd 100755 --- a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.h +++ b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerNVCC.h @@ -18,159 +18,33 @@ #ifndef ALIHLTTPCCAGPUTRACKERNVCC_H #define ALIHLTTPCCAGPUTRACKERNVCC_H -#include "AliHLTTPCCAGPUTracker.h" -#include "AliHLTTPCCADef.h" -#include "AliHLTTPCCATracker.h" -#include "AliHLTLogging.h" -#include "AliHLTTPCCASliceOutput.h" +#include "AliHLTTPCCAGPUTrackerBase.h" -#ifdef __CINT__ -typedef int cudaError_t -#elif defined(R__WIN32) -#include "../cmodules/pthread_mutex_win32_wrapper.h" -#else -#include -#include -#endif - -class AliHLTTPCCARow; - -class AliHLTTPCCAGPUTrackerNVCC : public AliHLTTPCCAGPUTracker, public AliHLTLogging +class AliHLTTPCCAGPUTrackerNVCC : public AliHLTTPCCAGPUTrackerBase { - friend void* helperWrapper(void*); public: AliHLTTPCCAGPUTrackerNVCC(); virtual ~AliHLTTPCCAGPUTrackerNVCC(); - virtual int InitGPU(int sliceCount = -1, int forceDeviceID = 
-1); - virtual int IsInitialized(); + virtual int InitGPU_Runtime(int sliceCount = -1, int forceDeviceID = -1); virtual int Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1); - int ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1); - int SelfHealReconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1); - virtual int ExitGPU(); - - virtual void SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile = NULL); - virtual int SetGPUTrackerOption(char* OptionName, int OptionValue); - - virtual unsigned long long int* PerfTimer(int iSlice, unsigned int i); - - virtual int InitializeSliceParam(int iSlice, AliHLTTPCCAParam ¶m); - virtual void SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val); - - virtual const AliHLTTPCCASliceOutput::outputControlStruct* OutputControl() const; - virtual int GetSliceCount() const; - + virtual int ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1); + virtual int ExitGPU_Runtime(); virtual int RefitMergedTracks(AliHLTTPCGMMerger* Merger); - virtual char* MergerBaseMemory(); - -private: - struct helperParam - { - void* fThreadId; - AliHLTTPCCAGPUTrackerNVCC* fCls; - int fNum; - int fSliceCount; - AliHLTTPCCAClusterData* pClusterData; - AliHLTTPCCASliceOutput** pOutput; - int fFirstSlice; - void* fMutex; - bool fTerminate; - int fPhase; - int CPUTracker; - volatile int fDone; - volatile bool fReset; - }; - - static void* RowMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + iSlice * sizeof(AliHLTTPCCARow) * (HLTCA_ROW_COUNT + 1) ); } - static void* CommonMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + iSlice * AliHLTTPCCATracker::CommonMemorySize() ); } - static void* SliceDataMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + iSlice * HLTCA_GPU_SLICE_DATA_MEMORY ); } - void* GlobalMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY) + iSlice * HLTCA_GPU_GLOBAL_MEMORY ); } - void* TracksMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY) + iSlice * HLTCA_GPU_TRACKS_MEMORY ); } - void* TrackerMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + iSlice * sizeof(AliHLTTPCCATracker) ); } - - void ReadEvent(AliHLTTPCCAClusterData* pClusterData, int firstSlice, int iSlice, int threadId); - void WriteOutput(AliHLTTPCCASliceOutput** pOutput, int firstSlice, int iSlice, int threadId); - int GlobalTracking(int iSlice, int threadId, helperParam* hParam); + virtual int GPUMergerAvailable(); - int StartHelperThreads(); - int StopHelperThreads(); - void ResetHelperThreads(int helpers); - void ResetThisHelperThread(AliHLTTPCCAGPUTrackerNVCC::helperParam* par); +protected: + virtual void ActivateThreadContext(); + virtual void ReleaseThreadContext(); + virtual void SynchronizeGPU(); + virtual int GPUSync(char* state = "UNKNOWN", int 
stream = -1, int slice = 0); +private: void DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check = true); - int GetThread(); - void ReleaseGlobalLock(void* sem); - int CheckMemorySizes(int sliceCount); - - int CUDASync(char* state = "UNKNOWN", int sliceLocal = 0, int slice = 0); - template T* alignPointer(T* ptr, int alignment); - void StandalonePerfTime(int iSlice, int i); -#define CudaFailedMsg(x) CudaFailedMsgA(x, __FILE__, __LINE__) - bool CudaFailedMsgA(cudaError_t error, const char* file, int line); - - static void* helperWrapper(void*); - - AliHLTTPCCATracker *fGpuTracker; //Tracker Objects that will be used on the GPU - void* fGPUMemory; //Pointer to GPU Memory Base Adress - void* fHostLockedMemory; //Pointer to Base Adress of Page Locked Host Memory for DMA Transfer - - void* fGPUMergerMemory; - void* fGPUMergerHostMemory; - int fGPUMergerMaxMemory; - - int fDebugLevel; //Debug Level for GPU Tracker - unsigned int fDebugMask; //Mask which Debug Data is written to file - std::ostream* fOutFile; //Debug Output Stream Pointer - unsigned long long int fGPUMemSize; //Memory Size to allocate on GPU - - void* fpCudaStreams; //Pointer to array of CUDA Streams - int fSliceCount; //Maximum Number of Slices this GPU tracker can process in parallel - int fCudaDevice; //CUDA device used by GPU tracker - - static const int fgkNSlices = 36; //Number of Slices in Alice - AliHLTTPCCATracker fSlaveTrackers[fgkNSlices]; //CPU Slave Trackers for Initialization and Output - - AliHLTTPCCASliceOutput::outputControlStruct* fOutputControl; //Output Control Structure - - int fThreadId; //Thread ID that is valid for the local CUDA context - int fCudaInitialized; //Flag if CUDA is initialized - - int fPPMode; //Flag if GPU tracker runs in PP Mode - int fSelfheal; //Reinitialize GPU on failure - - int fConstructorBlockCount; //GPU blocks used in Tracklet Constructor - int selectorBlockCount; //GPU blocks used in Tracklet Selector - -#ifdef HLTCA_GPU_TIME_PROFILE - unsigned long long int fProfTimeC, fProfTimeD; //Timing -#endif - void* fCudaContext; //Pointer to CUDA context - - int fNHelperThreads; //Number of helper threads for post/preprocessing - helperParam* fHelperParams; //Control Struct for helper threads - void* fHelperMemMutex; - -#ifdef __ROOT__ -#define volatile -#endif - volatile int fSliceOutputReady; - volatile char fSliceLeftGlobalReady[fgkNSlices]; - volatile char fSliceRightGlobalReady[fgkNSlices]; -#ifdef __ROOT__ -#undef volatile -#endif - void* fSliceGlobalMutexes; - char fGlobalTrackingDone[fgkNSlices]; - char fWriteOutputDone[fgkNSlices]; + bool GPUFailedMsgA(cudaError_t error, const char* file, int line); - int fNCPUTrackers; //Number of CPU trackers to use - int fNSlicesPerCPUTracker; //Number of slices processed by each CPU tracker - - int fGlobalTracking; //Use Global Tracking - int fUseGlobalTracking; - - int fNSlaveThreads; //Number of slave threads currently active + void* fpCudaStreams; //Pointer to array of CUDA Streams // disable copy AliHLTTPCCAGPUTrackerNVCC( const AliHLTTPCCAGPUTrackerNVCC& ); diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.cl b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.cl new file mode 100644 index 00000000000..4e7a63fd71b --- /dev/null +++ b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.cl @@ -0,0 +1,113 @@ +#define __OPENCL__ +#define RADEON + +//Disable assertions since they produce errors in GPU Code +#ifdef assert +#undef assert +#endif +#define assert(param) + +#include 
"AliHLTTPCCATrackParam.cxx" +#include "AliHLTTPCCATrack.cxx" + +#include "AliHLTTPCCAHitArea.cxx" +#include "AliHLTTPCCAGrid.cxx" +#include "AliHLTTPCCARow.cxx" +#include "AliHLTTPCCAParam.cxx" +#include "AliHLTTPCCATracker.cxx" + +#include "AliHLTTPCCATrackletSelector.cxx" +#include "AliHLTTPCCANeighboursFinder.cxx" +#include "AliHLTTPCCANeighboursCleaner.cxx" +#include "AliHLTTPCCAStartHitsFinder.cxx" +#include "AliHLTTPCCAStartHitsSorter.cxx" +#include "AliHLTTPCCATrackletConstructor.cxx" + +__kernel void PreInitRowBlocks(__global char* gpu_mem, GPUconstant() void* pTrackerTmp, int iSlice) +{ + GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &pTracker = (( GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) * ) pTrackerTmp)[iSlice]; + if (gpu_mem != pTracker.GPUParametersConst()->fGPUMem) return; + + //Initialize GPU RowBlocks and HitWeights + const int nSliceDataHits = pTracker.Data().NumberOfHitsPlusAlign(); + __global int4* SliceDataHitWeights4 = (__global int4*) pTracker.Data().HitWeights(); + + const int stride = get_global_size(0); + int4 i0; + i0.x = i0.y = i0.z = i0.w = 0; + for (int i = get_global_id(0);i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride) + SliceDataHitWeights4[i] = i0; +} + +GPUg() void AliHLTTPCCAProcess_AliHLTTPCCANeighboursFinder(__global char* gpu_mem, GPUconstant() void* pTrackerTmp, int iSlice) +{ + GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &pTracker = (( GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) * ) pTrackerTmp)[iSlice]; + if (gpu_mem != pTracker.GPUParametersConst()->fGPUMem) return; + GPUshared() typename AliHLTTPCCANeighboursFinder::MEM_LOCAL(AliHLTTPCCASharedMemory) smem; + + for( int iSync=0; iSync<=AliHLTTPCCANeighboursFinder::NThreadSyncPoints(); iSync++){ + GPUsync(); + AliHLTTPCCANeighboursFinder::Thread( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), iSync, smem, pTracker ); + } +} + +GPUg() void AliHLTTPCCAProcess_AliHLTTPCCANeighboursCleaner(__global char* gpu_mem, GPUconstant() void* pTrackerTmp, int iSlice) +{ + GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &pTracker = (( GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) * ) pTrackerTmp)[iSlice]; + if (gpu_mem != pTracker.GPUParametersConst()->fGPUMem) return; + GPUshared() typename AliHLTTPCCANeighboursCleaner::MEM_LOCAL(AliHLTTPCCASharedMemory) smem; + + for( int iSync=0; iSync<=AliHLTTPCCANeighboursCleaner::NThreadSyncPoints(); iSync++){ + GPUsync(); + AliHLTTPCCANeighboursCleaner::Thread( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), iSync, smem, pTracker ); + } +} + +GPUg() void AliHLTTPCCAProcess_AliHLTTPCCAStartHitsFinder(__global char* gpu_mem, GPUconstant() void* pTrackerTmp, int iSlice) +{ + GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &pTracker = (( GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) * ) pTrackerTmp)[iSlice]; + if (gpu_mem != pTracker.GPUParametersConst()->fGPUMem) return; + GPUshared() typename AliHLTTPCCAStartHitsFinder::MEM_LOCAL(AliHLTTPCCASharedMemory) smem; + + for( int iSync=0; iSync<=AliHLTTPCCAStartHitsFinder::NThreadSyncPoints(); iSync++){ + GPUsync(); + AliHLTTPCCAStartHitsFinder::Thread( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), iSync, smem, pTracker ); + } +} + +GPUg() void AliHLTTPCCAProcess_AliHLTTPCCAStartHitsSorter(__global char* gpu_mem, GPUconstant() void* pTrackerTmp, int iSlice) +{ + GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &pTracker = (( GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) * ) pTrackerTmp)[iSlice]; + if (gpu_mem != 
pTracker.GPUParametersConst()->fGPUMem) return; + GPUshared() typename AliHLTTPCCAStartHitsSorter::MEM_LOCAL(AliHLTTPCCASharedMemory) smem; + + for( int iSync=0; iSync<=AliHLTTPCCAStartHitsSorter::NThreadSyncPoints(); iSync++){ + GPUsync(); + AliHLTTPCCAStartHitsSorter::Thread( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), iSync, smem, pTracker ); + } +} + +GPUg() void AliHLTTPCCAProcessMulti_AliHLTTPCCATrackletSelector(__global char* gpu_mem, GPUconstant() void* pTrackerTmp, int firstSlice, int nSliceCount) +{ + const int iSlice = nSliceCount * (get_group_id(0) + (get_num_groups(0) % nSliceCount != 0 && nSliceCount * (get_group_id(0) + 1) % get_num_groups(0) != 0)) / get_num_groups(0); + const int nSliceBlockOffset = get_num_groups(0) * iSlice / nSliceCount; + const int sliceBlockId = get_group_id(0) - nSliceBlockOffset; + const int sliceGridDim = get_num_groups(0) * (iSlice + 1) / nSliceCount - get_num_groups(0) * (iSlice) / nSliceCount; + GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &pTracker = (( GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) * ) pTrackerTmp)[firstSlice + iSlice]; + if (gpu_mem != pTracker.GPUParametersConst()->fGPUMem) return; + GPUshared() typename AliHLTTPCCATrackletSelector::MEM_LOCAL(AliHLTTPCCASharedMemory) smem; + + for( int iSync=0; iSync<=AliHLTTPCCATrackletSelector::NThreadSyncPoints(); iSync++){ + GPUsync(); + AliHLTTPCCATrackletSelector::Thread( sliceGridDim, get_local_size(0), sliceBlockId, get_local_id(0), iSync, smem, pTracker ); + } +} + +GPUg() void AliHLTTPCCATrackletConstructorGPU(__global char* gpu_mem, GPUconstant() void* pTrackerTmp) +{ + //GPU Wrapper for AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU + GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) *pTracker = ( GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) * ) pTrackerTmp ; + if (gpu_mem != pTracker[0].GPUParametersConst()->fGPUMem) return; + GPUshared() AliHLTTPCCATrackletConstructor::MEM_LOCAL(AliHLTTPCCASharedMemory) sMem; + AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(pTracker, sMem); +} diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.cxx b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.cxx new file mode 100644 index 00000000000..ba497ba5f66 --- /dev/null +++ b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.cxx @@ -0,0 +1,810 @@ +// ************************************************************************** +// This file is property of and copyright by the ALICE HLT Project * +// ALICE Experiment at CERN, All rights reserved. * +// * +// Primary Authors: Sergey Gorbunov * +// Ivan Kisel * +// David Rohr * +// for The ALICE HLT Project. * +// * +// Permission to use, copy, modify and distribute this software and its * +// documentation strictly for non-commercial purposes is hereby granted * +// without fee, provided that the above copyright notice appears in all * +// copies and that both the copyright notice and this permission notice * +// appear in the supporting documentation. The authors make no claims * +// about the suitability of this software for any purpose. It is * +// provided "as is" without express or implied warranty. 
* +// * +//*************************************************************************** + +#define __OPENCL__ +#define RADEON +#define HLTCA_HOSTCODE + +#include +#include "AliHLTTPCCAGPUTrackerOpenCL.h" +#include "AliHLTTPCCAGPUTrackerOpenCLInternals.h" +#include "AliHLTTPCCAGPUTrackerCommon.h" + +#include "AliHLTTPCCATrackParam.h" +#include "AliHLTTPCCATrack.h" + +#include "AliHLTTPCCAHitArea.h" +#include "AliHLTTPCCAGrid.h" +#include "AliHLTTPCCARow.h" +#include "AliHLTTPCCAParam.h" +#include "AliHLTTPCCATracker.h" + +#include "AliHLTTPCCAProcess.h" + +#include "AliHLTTPCCATrackletSelector.h" +#include "AliHLTTPCCANeighboursFinder.h" +#include "AliHLTTPCCANeighboursCleaner.h" +#include "AliHLTTPCCAStartHitsFinder.h" +#include "AliHLTTPCCAStartHitsSorter.h" +#include "AliHLTTPCCATrackletConstructor.h" +#include "AliHLTTPCCAClusterData.h" + +#include "../makefiles/opencl_obtain_program.h" +extern "C" char _makefile_opencl_program_cagpubuild_AliHLTTPCCAGPUTrackerOpenCL_cl[]; + +ClassImp( AliHLTTPCCAGPUTrackerOpenCL ) + +AliHLTTPCCAGPUTrackerOpenCL::AliHLTTPCCAGPUTrackerOpenCL() : ocl(NULL) +{ + ocl = new AliHLTTPCCAGPUTrackerOpenCLInternals; + if (ocl == NULL) + { + HLTError("Memory Allocation Error"); + } + ocl->mem_host_ptr = NULL; + ocl->selector_events = NULL; + ocl->devices = NULL; +}; + +AliHLTTPCCAGPUTrackerOpenCL::~AliHLTTPCCAGPUTrackerOpenCL() +{ + delete ocl; +}; + +#define quit(msg) {HLTError(msg);return(1);} + +int AliHLTTPCCAGPUTrackerOpenCL::InitGPU_Runtime(int sliceCount, int forceDeviceID) +{ + //Find best OPENCL device, initialize and allocate memory + + cl_int ocl_error; + cl_uint num_platforms; + if (clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS) quit("Error getting OpenCL Platform Count"); + if (num_platforms == 0) quit("No OpenCL Platform found"); + if (fDebugLevel >= 2) HLTInfo("%d OpenCL Platforms found", num_platforms); + + //Query platforms + cl_platform_id* platforms = new cl_platform_id[num_platforms]; + if (platforms == NULL) quit("Memory allocation error"); + if (clGetPlatformIDs(num_platforms, platforms, NULL) != CL_SUCCESS) quit("Error getting OpenCL Platforms"); + + cl_platform_id platform; + bool found = false; + for (unsigned int i_platform = 0;i_platform < num_platforms;i_platform++) + { + char platform_profile[64], platform_version[64], platform_name[64], platform_vendor[64]; + clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_PROFILE, 64, platform_profile, NULL); + clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_VERSION, 64, platform_version, NULL); + clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_NAME, 64, platform_name, NULL); + clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_VENDOR, 64, platform_vendor, NULL); + if (fDebugLevel >= 2) {HLTDebug("Available Platform %d: (%s %s) %s %s\n", i_platform, platform_profile, platform_version, platform_vendor, platform_name);} + if (strcmp(platform_vendor, "Advanced Micro Devices, Inc.") == 0) + { + found = true; + if (fDebugLevel >= 2) HLTInfo("AMD OpenCL Platform found"); + platform = platforms[i_platform]; + break; + } + } + if (found == false) + { + HLTError("Did not find AMD OpenCL Platform"); + return(1); + } + + cl_uint count, bestDevice = (cl_uint) -1; + long long int bestDeviceSpeed = 0, deviceSpeed; + if (GPUFailedMsg(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &count))) + { + HLTError("Error getting OPENCL Device Count"); + return(1); + } + + //Query devices + ocl->devices = new cl_device_id[count]; + if (ocl->devices == NULL) quit("Memory allocation error"); + 
if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, count, ocl->devices, NULL) != CL_SUCCESS) quit("Error getting OpenCL devices"); + + char device_vendor[64], device_name[64]; + cl_device_type device_type; + cl_uint freq, shaders; + + if (fDebugLevel >= 2) HLTInfo("Available OPENCL devices:"); + for (unsigned int i = 0;i < count;i++) + { + if (fDebugLevel >= 3) {HLTDebug("Examining device %d\n", i);} + cl_uint nbits; + + clGetDeviceInfo(ocl->devices[i], CL_DEVICE_NAME, 64, device_name, NULL); + clGetDeviceInfo(ocl->devices[i], CL_DEVICE_VENDOR, 64, device_vendor, NULL); + clGetDeviceInfo(ocl->devices[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL); + clGetDeviceInfo(ocl->devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(freq), &freq, NULL); + clGetDeviceInfo(ocl->devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(shaders), &shaders, NULL); + clGetDeviceInfo(ocl->devices[i], CL_DEVICE_ADDRESS_BITS, sizeof(nbits), &nbits, NULL); + //if (device_type & CL_DEVICE_TYPE_CPU) continue; + //if (!(device_type & CL_DEVICE_TYPE_GPU)) continue; + if (nbits / 8 != sizeof(void*)) continue; + + deviceSpeed = (long long int) freq * (long long int) shaders; + if (device_type & CL_DEVICE_TYPE_GPU) deviceSpeed *= 10; + if (fDebugLevel >= 2) {HLTDebug("Found Device %d: %s %s (Frequency %d, Shaders %d, %d bit) (Speed Value: %lld)\n", i, device_vendor, device_name, (int) freq, (int) shaders, (int) nbits, (long long int) deviceSpeed);} + + if (deviceSpeed > bestDeviceSpeed) + { + bestDevice = i; + bestDeviceSpeed = deviceSpeed; + } + } + if (bestDevice == (cl_uint) -1) + { + HLTWarning("No %sOPENCL Device available, aborting OPENCL Initialisation", count ? "appropriate " : ""); + return(1); + } + + if (forceDeviceID > -1 && forceDeviceID < (signed) count) bestDevice = forceDeviceID; + ocl->device = ocl->devices[bestDevice]; + + clGetDeviceInfo(ocl->device, CL_DEVICE_NAME, 64, device_name, NULL); + clGetDeviceInfo(ocl->device, CL_DEVICE_VENDOR, 64, device_vendor, NULL); + clGetDeviceInfo(ocl->device, CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL); + clGetDeviceInfo(ocl->device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(freq), &freq, NULL); + clGetDeviceInfo(ocl->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(shaders), &shaders, NULL); + if (fDebugLevel >= 2) {HLTDebug("Using OpenCL device %d: %s %s (Frequency %d, Shaders %d)\n", bestDevice, device_vendor, device_name, (int) freq, (int) shaders);} + + cl_uint compute_units; + clGetDeviceInfo(ocl->device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &compute_units, NULL); + + fConstructorBlockCount = compute_units * HLTCA_GPU_BLOCK_COUNT_CONSTRUCTOR_MULTIPLIER; + selectorBlockCount = compute_units * HLTCA_GPU_BLOCK_COUNT_SELECTOR_MULTIPLIER; + + ocl->context = clCreateContext(NULL, count, ocl->devices, NULL, NULL, &ocl_error); + if (ocl_error != CL_SUCCESS) + { + HLTError("Could not create OPENCL Device Context!"); + return(1); + } + + //Workaround to compile CL kernel during tracker initialization + /*{ + char* file = "cagpubuild/AliHLTTPCCAGPUTrackerOpenCL.cl"; + HLTDebug("Reading source file %s\n", file); + FILE* fp = fopen(file, "rb"); + if (fp == NULL) + { + HLTDebug("Cannot open %s\n", file); + return(1); + } + fseek(fp, 0, SEEK_END); + size_t file_size = ftell(fp); + fseek(fp, 0, SEEK_SET); + + char* buffer = (char*) malloc(file_size + 1); + if (buffer == NULL) + { + quit("Memory allocation error"); + } + if (fread(buffer, 1, file_size, fp) != file_size) + { + quit("Error reading file"); + } + buffer[file_size] = 0; + fclose(fp); + + 
HLTDebug("Creating OpenCL Program Object\n"); + //Create OpenCL program object + ocl->program = clCreateProgramWithSource(ocl->context, (cl_uint) 1, (const char**) &buffer, NULL, &ocl_error); + if (ocl_error != CL_SUCCESS) quit("Error creating program object"); + + HLTDebug("Compiling OpenCL Program\n"); + //Compile program + ocl_error = clBuildProgram(ocl->program, count, ocl->devices, "-I. -Iinclude -Icode -Ibase -Imerger-ca -Icagpubuild -I/home/qon/AMD-APP-SDK-v2.8.1.0-RC-lnx64/include -I/usr/local/cuda/include -DHLTCA_STANDALONE -DBUILD_GPU -D_64BIT -x clc++", NULL, NULL); + if (ocl_error != CL_SUCCESS) + { + HLTDebug("OpenCL Error while building program: %d (Compiler options: %s)\n", ocl_error, ""); + + for (unsigned int i = 0;i < count;i++) + { + cl_build_status status; + clGetProgramBuildInfo(ocl->program, ocl->devices[i], CL_PROGRAM_BUILD_STATUS, sizeof(status), &status, NULL); + if (status == CL_BUILD_ERROR) + { + size_t log_size; + clGetProgramBuildInfo(ocl->program, ocl->devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char* build_log = (char*) malloc(log_size + 1); + if (build_log == NULL) quit("Memory allocation error"); + clGetProgramBuildInfo(ocl->program, ocl->devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); + HLTDebug("Build Log (device %d):\n\n%s\n\n", i, build_log); + free(build_log); + } + } + } + }*/ + + if (_makefiles_opencl_obtain_program_helper(ocl->context, count, ocl->devices, &ocl->program, _makefile_opencl_program_cagpubuild_AliHLTTPCCAGPUTrackerOpenCL_cl)) + { + clReleaseContext(ocl->context); + HLTError("Could not obtain OpenCL progarm"); + return(1); + } + if (fDebugLevel >= 2) HLTInfo("OpenCL program loaded successfully"); + + ocl->kernel_row_blocks = clCreateKernel(ocl->program, "PreInitRowBlocks", &ocl_error); if (ocl_error != CL_SUCCESS) {HLTError("OPENCL Kernel Error 1");return(1);} + ocl->kernel_neighbours_finder = clCreateKernel(ocl->program, "AliHLTTPCCAProcess_AliHLTTPCCANeighboursFinder", &ocl_error); if (ocl_error != CL_SUCCESS) {HLTError("OPENCL Kernel Error 1");return(1);} + ocl->kernel_neighbours_cleaner = clCreateKernel(ocl->program, "AliHLTTPCCAProcess_AliHLTTPCCANeighboursCleaner", &ocl_error); if (ocl_error != CL_SUCCESS) {HLTError("OPENCL Kernel Error 2");return(1);} + ocl->kernel_start_hits_finder = clCreateKernel(ocl->program, "AliHLTTPCCAProcess_AliHLTTPCCAStartHitsFinder", &ocl_error); if (ocl_error != CL_SUCCESS) {HLTError("OPENCL Kernel Error 3");return(1);} + ocl->kernel_start_hits_sorter = clCreateKernel(ocl->program, "AliHLTTPCCAProcess_AliHLTTPCCAStartHitsSorter", &ocl_error); if (ocl_error != CL_SUCCESS) {HLTError("OPENCL Kernel Error 4");return(1);} + ocl->kernel_tracklet_selector = clCreateKernel(ocl->program, "AliHLTTPCCAProcessMulti_AliHLTTPCCATrackletSelector", &ocl_error); if (ocl_error != CL_SUCCESS) {HLTError("OPENCL Kernel Error 5");return(1);} + ocl->kernel_tracklet_constructor = clCreateKernel(ocl->program, "AliHLTTPCCATrackletConstructorGPU", &ocl_error); if (ocl_error != CL_SUCCESS) {HLTError("OPENCL Kernel Error 6");return(1);} + if (fDebugLevel >= 2) HLTInfo("OpenCL kernels created successfully"); + + ocl->mem_gpu = clCreateBuffer(ocl->context, CL_MEM_READ_WRITE, fGPUMemSize, NULL, &ocl_error); + if (ocl_error != CL_SUCCESS) + { + HLTError("OPENCL Memory Allocation Error"); + clReleaseContext(ocl->context); + return(1); + } + + ocl->mem_constant = clCreateBuffer(ocl->context, CL_MEM_READ_ONLY, HLTCA_GPU_TRACKER_CONSTANT_MEM, NULL, &ocl_error); + if (ocl_error != CL_SUCCESS) + { + 
HLTError("OPENCL Constant Memory Allocation Error"); + clReleaseMemObject(ocl->mem_gpu); + clReleaseContext(ocl->context); + return(1); + } + + int nStreams = CAMath::Max(3, fSliceCount); + if (nStreams > 36) + { + HLTError("Uhhh, more than 36 command queues requested, cannot do this. Did the TPC become larger?"); + return(1); + } + for (int i = 0;i < nStreams;i++) + { + ocl->command_queue[i] = clCreateCommandQueue(ocl->context, ocl->device, 0, &ocl_error); + if (ocl_error != CL_SUCCESS) quit("Error creating OpenCL command queue"); + } + if (clEnqueueMigrateMemObjects(ocl->command_queue[0], 1, &ocl->mem_gpu, 0, 0, NULL, NULL) != CL_SUCCESS) quit("Error migrating buffer"); + + if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize); + int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY; + + ocl->mem_host = clCreateBuffer(ocl->context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, hostMemSize, NULL, &ocl_error); + if (ocl_error != CL_SUCCESS) quit("Error allocating pinned host memory"); + + const char* krnlGetPtr = "__kernel void krnlGetPtr(__global char* gpu_mem, __global size_t* host_mem) {if (get_global_id(0) == 0) *host_mem = (size_t) gpu_mem;}"; + cl_program program = clCreateProgramWithSource(ocl->context, 1, (const char**) &krnlGetPtr, NULL, &ocl_error); + if (ocl_error != CL_SUCCESS) quit("Error creating program object"); + ocl_error = clBuildProgram(program, 1, &ocl->device, "", NULL, NULL); + if (ocl_error != CL_SUCCESS) + { + char build_log[16384]; + clGetProgramBuildInfo(program, ocl->device, CL_PROGRAM_BUILD_LOG, 16384, build_log, NULL); + HLTImportant("Build Log:\n\n%s\n\n", build_log); + quit("Error compiling program"); + } + cl_kernel kernel = clCreateKernel(program, "krnlGetPtr", &ocl_error); + if (ocl_error != CL_SUCCESS) quit("Error creating kernel"); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &ocl->mem_gpu); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &ocl->mem_host); + size_t local_size = 16, global_size = 16; + if (clEnqueueNDRangeKernel(ocl->command_queue[0], kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL) != CL_SUCCESS) quit("Error executing kernel"); + clFinish(ocl->command_queue[0]); + clReleaseKernel(kernel); + clReleaseProgram(program); + + if (fDebugLevel >= 2) HLTInfo("Mapping hostmemory"); + ocl->mem_host_ptr = clEnqueueMapBuffer(ocl->command_queue[0], ocl->mem_host, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, hostMemSize, 0, NULL, NULL, &ocl_error); + if (ocl_error != CL_SUCCESS) + { + HLTError("Error allocating Page Locked Host Memory"); + return(1); + } + fHostLockedMemory = ocl->mem_host_ptr; + if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize); + fGPUMergerHostMemory = ((char*) fHostLockedMemory) + hostMemSize - fGPUMergerMaxMemory; + + if (fDebugLevel >= 2) HLTInfo("Obtained Pointer to GPU Memory: %p", *((void**) ocl->mem_host_ptr)); + fGPUMemory = *((void**) ocl->mem_host_ptr); + fGPUMergerMemory = ((char*) fGPUMemory) + fGPUMemSize - fGPUMergerMaxMemory; + + if (fDebugLevel >= 1) + { + memset(ocl->mem_host_ptr, 0, hostMemSize); + } + + ocl->selector_events = new cl_event[fSliceCount]; + + HLTImportant("OPENCL Initialisation successfull (%d: %s %s (Frequency %d, Shaders %d) Thread %d, Max slices: %d)", bestDevice, device_vendor, device_name, (int) freq, (int) shaders, fThreadId, fSliceCount); + + return(0); +} + +static const char* opencl_error_string(int errorcode) +{ + switch (errorcode) + { + case 
CL_SUCCESS: return "Success!"; + case CL_DEVICE_NOT_FOUND: return "Device not found."; + case CL_DEVICE_NOT_AVAILABLE: return "Device not available"; + case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure"; + case CL_OUT_OF_RESOURCES: return "Out of resources"; + case CL_OUT_OF_HOST_MEMORY: return "Out of host memory"; + case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available"; + case CL_MEM_COPY_OVERLAP: return "Memory copy overlap"; + case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported"; + case CL_BUILD_PROGRAM_FAILURE: return "Program build failure"; + case CL_MAP_FAILURE: return "Map failure"; + case CL_INVALID_VALUE: return "Invalid value"; + case CL_INVALID_DEVICE_TYPE: return "Invalid device type"; + case CL_INVALID_PLATFORM: return "Invalid platform"; + case CL_INVALID_DEVICE: return "Invalid device"; + case CL_INVALID_CONTEXT: return "Invalid context"; + case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties"; + case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue"; + case CL_INVALID_HOST_PTR: return "Invalid host pointer"; + case CL_INVALID_MEM_OBJECT: return "Invalid memory object"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor"; + case CL_INVALID_IMAGE_SIZE: return "Invalid image size"; + case CL_INVALID_SAMPLER: return "Invalid sampler"; + case CL_INVALID_BINARY: return "Invalid binary"; + case CL_INVALID_BUILD_OPTIONS: return "Invalid build options"; + case CL_INVALID_PROGRAM: return "Invalid program"; + case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable"; + case CL_INVALID_KERNEL_NAME: return "Invalid kernel name"; + case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition"; + case CL_INVALID_KERNEL: return "Invalid kernel"; + case CL_INVALID_ARG_INDEX: return "Invalid argument index"; + case CL_INVALID_ARG_VALUE: return "Invalid argument value"; + case CL_INVALID_ARG_SIZE: return "Invalid argument size"; + case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments"; + case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension"; + case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group size"; + case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size"; + case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset"; + case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list"; + case CL_INVALID_EVENT: return "Invalid event"; + case CL_INVALID_OPERATION: return "Invalid operation"; + case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object"; + case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size"; + case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level"; + default: return "Unknown Errorcode"; + } +} + + +bool AliHLTTPCCAGPUTrackerOpenCL::GPUFailedMsgA(int error, const char* file, int line) +{ + //Check for OPENCL Error and in the case of an error display the corresponding error string + if (error == CL_SUCCESS) return(false); + HLTWarning("OCL Error: %d / %s (%s:%d)", error, opencl_error_string(error), file, line); + return(true); +} + +int AliHLTTPCCAGPUTrackerOpenCL::GPUSync(char* state, int stream, int slice) +{ + //Wait for OPENCL-Kernel to finish and check for OPENCL errors afterwards + + if (fDebugLevel == 0) return(0); + for (int i = 0;i < fSliceCount;i++) + { + if (stream != -1) i = stream; + clFinish(ocl->command_queue[i]); + if (stream != -1) 
break; + } + if (fDebugLevel >= 3) HLTInfo("OPENCL Sync Done"); + return(0); +} + +template <class T> static inline cl_int clSetKernelArgA(cl_kernel krnl, cl_uint num, T arg) +{ + return(clSetKernelArg(krnl, num, sizeof(T), &arg)); +} + +static inline cl_int clExecuteKernelA(cl_command_queue queue, cl_kernel krnl, size_t local_size, size_t global_size, cl_event* pEvent) +{ + return(clEnqueueNDRangeKernel(queue, krnl, 1, NULL, &global_size, &local_size, 0, NULL, pEvent)); +} + +int AliHLTTPCCAGPUTrackerOpenCL::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal) +{ + //Primary reconstruction function + + if (Reconstruct_Base_Init(pOutput, pClusterData, firstSlice, sliceCountLocal)) return(1); + + //Copy Tracker Object to GPU Memory + if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU"); + + GPUFailedMsg(clEnqueueWriteBuffer(ocl->command_queue[0], ocl->mem_constant, CL_FALSE, 0, sizeof(AliHLTTPCCATracker) * sliceCountLocal, fGpuTracker, 0, NULL, NULL)); + + if (GPUSync("Initialization (1)", 0, firstSlice) RANDOM_ERROR) + { + ResetHelperThreads(0); + return(1); + } + + for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) + { + if (Reconstruct_Base_SliceInit(pClusterData, iSlice, firstSlice)) return(1); + + //Initialize temporary memory where needed + if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory"); + clSetKernelArgA(ocl->kernel_row_blocks, 0, ocl->mem_gpu); + clSetKernelArgA(ocl->kernel_row_blocks, 1, ocl->mem_constant); + clSetKernelArgA(ocl->kernel_row_blocks, 2, iSlice); + clExecuteKernelA(ocl->command_queue[2], ocl->kernel_row_blocks, HLTCA_GPU_THREAD_COUNT, HLTCA_GPU_THREAD_COUNT * fConstructorBlockCount, NULL); + if (GPUSync("Initialization (2)", 2, iSlice + firstSlice) RANDOM_ERROR) + { + ResetHelperThreads(1); + return(1); + } + + //Copy Data to GPU Global Memory + GPUFailedMsg(clEnqueueWriteBuffer(ocl->command_queue[iSlice & 1], ocl->mem_gpu, CL_FALSE, (char*) fGpuTracker[iSlice].CommonMemory() - (char*) fGPUMemory, fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), 0, NULL, NULL)); + GPUFailedMsg(clEnqueueWriteBuffer(ocl->command_queue[iSlice & 1], ocl->mem_gpu, CL_FALSE, (char*) fGpuTracker[iSlice].Data().Memory() - (char*) fGPUMemory, fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), 0, NULL, NULL)); + GPUFailedMsg(clEnqueueWriteBuffer(ocl->command_queue[iSlice & 1], ocl->mem_gpu, CL_FALSE, (char*) fGpuTracker[iSlice].SliceDataRows() - (char*) fGPUMemory, (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), 0, NULL, NULL)); + + if (fDebugLevel >= 4) + { + if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory"); + fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS, fConstructorBlockCount); + fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() ); + } + + if (GPUSync("Initialization (3)", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR) + { + ResetHelperThreads(1); + return(1); + } + StandalonePerfTime(firstSlice + iSlice, 1); + + if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder (Slice %d/%d)", iSlice, 
sliceCountLocal); + clSetKernelArgA(ocl->kernel_neighbours_finder, 0, ocl->mem_gpu); + clSetKernelArgA(ocl->kernel_neighbours_finder, 1, ocl->mem_constant); + clSetKernelArgA(ocl->kernel_neighbours_finder, 2, iSlice); + clExecuteKernelA(ocl->command_queue[iSlice & 1], ocl->kernel_neighbours_finder, HLTCA_GPU_THREAD_COUNT_FINDER, HLTCA_GPU_THREAD_COUNT_FINDER * fSlaveTrackers[firstSlice + iSlice].Param().NRows(), NULL); + + if (GPUSync("Neighbours finder", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR) + { + ResetHelperThreads(1); + return(1); + } + + StandalonePerfTime(firstSlice + iSlice, 2); + + if (fDebugLevel >= 4) + { + GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[iSlice & 1], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].Data().Memory() - (char*) fGPUMemory, fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), 0, NULL, NULL)); + if (fDebugMask & 2) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile); + } + + if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner (Slice %d/%d)", iSlice, sliceCountLocal); + clSetKernelArgA(ocl->kernel_neighbours_cleaner, 0, ocl->mem_gpu); + clSetKernelArgA(ocl->kernel_neighbours_cleaner, 1, ocl->mem_constant); + clSetKernelArgA(ocl->kernel_neighbours_cleaner, 2, iSlice); + clExecuteKernelA(ocl->command_queue[iSlice & 1], ocl->kernel_neighbours_cleaner, HLTCA_GPU_THREAD_COUNT, HLTCA_GPU_THREAD_COUNT * (fSlaveTrackers[firstSlice + iSlice].Param().NRows() - 2), NULL); + if (GPUSync("Neighbours Cleaner", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR) + { + ResetHelperThreads(1); + return(1); + } + + StandalonePerfTime(firstSlice + iSlice, 3); + + if (fDebugLevel >= 4) + { + GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[iSlice & 1], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].Data().Memory() - (char*) fGPUMemory, fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), 0, NULL, NULL)); + if (fDebugMask & 4) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile); + } + + if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder (Slice %d/%d)", iSlice, sliceCountLocal); + clSetKernelArgA(ocl->kernel_start_hits_finder, 0, ocl->mem_gpu); + clSetKernelArgA(ocl->kernel_start_hits_finder, 1, ocl->mem_constant); + clSetKernelArgA(ocl->kernel_start_hits_finder, 2, iSlice); + clExecuteKernelA(ocl->command_queue[iSlice & 1], ocl->kernel_start_hits_finder, HLTCA_GPU_THREAD_COUNT, HLTCA_GPU_THREAD_COUNT * (fSlaveTrackers[firstSlice + iSlice].Param().NRows() - 6), NULL); + + if (GPUSync("Start Hits Finder", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR) + { + ResetHelperThreads(1); + return(1); + } + + StandalonePerfTime(firstSlice + iSlice, 4); + + if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter (Slice %d/%d)", iSlice, sliceCountLocal); + clSetKernelArgA(ocl->kernel_start_hits_sorter, 0, ocl->mem_gpu); + clSetKernelArgA(ocl->kernel_start_hits_sorter, 1, ocl->mem_constant); + clSetKernelArgA(ocl->kernel_start_hits_sorter, 2, iSlice); + clExecuteKernelA(ocl->command_queue[iSlice & 1], ocl->kernel_start_hits_sorter, HLTCA_GPU_THREAD_COUNT, HLTCA_GPU_THREAD_COUNT * fConstructorBlockCount, NULL); + if (GPUSync("Start Hits Sorter", iSlice & 1, iSlice + firstSlice) RANDOM_ERROR) + { + ResetHelperThreads(1); + return(1); + } + + StandalonePerfTime(firstSlice + iSlice, 5); + + if (fDebugLevel >= 2) + { + GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[iSlice], ocl->mem_gpu, CL_TRUE, (char*) 
fGpuTracker[iSlice].CommonMemory() - (char*) fGPUMemory, fGpuTracker[iSlice].CommonMemorySize(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), 0, NULL, NULL) RANDOM_ERROR); + if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d (Slice %d)", *fSlaveTrackers[firstSlice + iSlice].NTracklets(), iSlice); + if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS RANDOM_ERROR) + { + HLTError("HLTCA_GPU_MAX_TRACKLETS constant insufficient"); + ResetHelperThreads(1); + return(1); + } + } + + if (fDebugLevel >= 4 && *fSlaveTrackers[firstSlice + iSlice].NTracklets()) + { +#ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT + GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[iSlice & 1], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].TrackletTmpStartHits() - (char*) fGPUMemory, pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), 0, NULL, NULL)); + if (fDebugMask & 8) + { + *fOutFile << "Temporary "; + fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile); + } + uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows()); + GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[iSlice & 1], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].RowStartHitCountOffset() - (char*) fGPUMemory, fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), tmpMemory, 0, NULL, NULL)); + if (fDebugMask & 16) + { + *fOutFile << "Start Hits Sort Vector:" << std::endl; + for (int i = 1;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows() - 5;i++) + { + *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl; + } + } + free(tmpMemory); +#endif + + GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[iSlice & 1], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].HitMemory() - (char*) fGPUMemory, fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), fSlaveTrackers[firstSlice + iSlice].HitMemory(), 0, NULL, NULL)); + if (fDebugMask & 32) fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile); + } + + StandalonePerfTime(firstSlice + iSlice, 6); + + fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters()); + } + + for (int i = 0;i < fNHelperThreads;i++) + { + pthread_mutex_lock(&((pthread_mutex_t*) fHelperParams[i].fMutex)[1]); + } + + StandalonePerfTime(firstSlice, 7); + + if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor"); + for (int i = 0;i < 3;i++) clFinish(ocl->command_queue[i]); + clSetKernelArgA(ocl->kernel_tracklet_constructor, 0, ocl->mem_gpu); + clSetKernelArgA(ocl->kernel_tracklet_constructor, 1, ocl->mem_constant); + clExecuteKernelA(ocl->command_queue[0], ocl->kernel_tracklet_constructor, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR * fConstructorBlockCount, NULL); + if (GPUSync("Tracklet Constructor", 0, firstSlice) RANDOM_ERROR) + { + SynchronizeGPU(); + return(1); + } + clFinish(ocl->command_queue[0]); + + StandalonePerfTime(firstSlice, 8); + + if (fDebugLevel >= 4) + { + for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) + { + GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[0], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].CommonMemory() - (char*) fGPUMemory, fGpuTracker[iSlice].CommonMemorySize(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), 0, NULL, NULL)); + if 
(fDebugLevel >= 5) + { + HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets()); + } + GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[0], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].TrackletMemory() - (char*) fGPUMemory, fGpuTracker[iSlice].TrackletMemorySize(), fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), 0, NULL, NULL)); + GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[0], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].HitMemory() - (char*) fGPUMemory, fGpuTracker[iSlice].HitMemorySize(), fSlaveTrackers[firstSlice + iSlice].HitMemory(), 0, NULL, NULL)); + if (fDebugMask & 128) fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile); + } + } + + int runSlices = 0; + for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices) + { + if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++; + if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices); + clSetKernelArgA(ocl->kernel_tracklet_selector, 0, ocl->mem_gpu); + clSetKernelArgA(ocl->kernel_tracklet_selector, 1, ocl->mem_constant); + clSetKernelArgA(ocl->kernel_tracklet_selector, 2, iSlice); + clSetKernelArgA(ocl->kernel_tracklet_selector, 3, (int) CAMath::Min(runSlices, sliceCountLocal - iSlice)); + clExecuteKernelA(ocl->command_queue[iSlice], ocl->kernel_tracklet_selector, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR * fConstructorBlockCount, NULL); + if (GPUSync("Tracklet Selector", iSlice, iSlice + firstSlice) RANDOM_ERROR) + { + SynchronizeGPU(); + return(1); + } + } + StandalonePerfTime(firstSlice, 9); + for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) + { + clEnqueueMarkerWithWaitList(ocl->command_queue[iSlice], 0, NULL, &ocl->selector_events[iSlice]); + } + + char *tmpMemoryGlobalTracking = NULL; + fSliceOutputReady = 0; + + if (Reconstruct_Base_StartGlobal(pOutput, tmpMemoryGlobalTracking)) return(1); + + int tmpSlice = 0, tmpSlice2 = 0; + for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++) + { + if (fDebugLevel >= 3) HLTInfo("Transferring Tracks from GPU to Host"); + cl_int eventdone; + + if (tmpSlice < sliceCountLocal) GPUFailedMsg(clGetEventInfo(ocl->selector_events[tmpSlice], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(eventdone), &eventdone, NULL)); + while(tmpSlice < sliceCountLocal && (tmpSlice == iSlice || eventdone == CL_COMPLETE)) + { + clReleaseEvent(ocl->selector_events[tmpSlice]); + if (GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[tmpSlice], ocl->mem_gpu, CL_FALSE, (char*) fGpuTracker[tmpSlice].CommonMemory() - (char*) fGPUMemory, fGpuTracker[tmpSlice].CommonMemorySize(), fSlaveTrackers[firstSlice + tmpSlice].CommonMemory(), 0, NULL, &ocl->selector_events[tmpSlice]) RANDOM_ERROR)) + { + HLTImportant("Error transferring tracks from GPU to host"); + ResetHelperThreads(1); + ActivateThreadContext(); + return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal)); + } + tmpSlice++; + if (tmpSlice < sliceCountLocal) GPUFailedMsg(clGetEventInfo(ocl->selector_events[tmpSlice], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(eventdone), &eventdone, NULL)); + } + + if (tmpSlice2 < tmpSlice) GPUFailedMsg(clGetEventInfo(ocl->selector_events[tmpSlice2], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(eventdone), &eventdone, NULL)); + while (tmpSlice2 < tmpSlice && (tmpSlice2 == iSlice ? 
(clFinish(ocl->command_queue[tmpSlice2]) == CL_SUCCESS) : (eventdone == CL_COMPLETE))) + { + if (*fSlaveTrackers[firstSlice + tmpSlice2].NTracks() > 0) + { + GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[tmpSlice2], ocl->mem_gpu, CL_FALSE, (char*) fGpuTracker[tmpSlice2].Tracks() - (char*) fGPUMemory, sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + tmpSlice2].NTracks(), fSlaveTrackers[firstSlice + tmpSlice2].Tracks(), 0, NULL, NULL)); + GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[tmpSlice2], ocl->mem_gpu, CL_FALSE, (char*) fGpuTracker[tmpSlice2].TrackHits() - (char*) fGPUMemory, sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + tmpSlice2].NTrackHits(), fSlaveTrackers[firstSlice + tmpSlice2].TrackHits(), 0, NULL, NULL)); + } + tmpSlice2++; + if (tmpSlice2 < tmpSlice) GPUFailedMsg(clGetEventInfo(ocl->selector_events[tmpSlice2], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(eventdone), &eventdone, NULL)); + } + + if (GPUFailedMsg(clFinish(ocl->command_queue[iSlice])) RANDOM_ERROR) + { + ResetHelperThreads(1); + ActivateThreadContext(); + for (int iSlice2 = 0;iSlice2 < sliceCountLocal;iSlice2++) clReleaseEvent(ocl->selector_events[iSlice2]); + return(SelfHealReconstruct(pOutput, pClusterData, firstSlice, sliceCountLocal)); + } + + if (fDebugLevel >= 4) + { + SynchronizeGPU(); +#ifndef BITWISE_COMPATIBLE_DEBUG_OUTPUT + //GPUFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost)); + GPUFailedMsg(clEnqueueReadBuffer(ocl->command_queue[0], ocl->mem_gpu, CL_TRUE, (char*) fGpuTracker[iSlice].TrackletMemory() - (char*) fGPUMemory, fGpuTracker[iSlice].TrackletMemorySize(), fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), 0, NULL, NULL)); + if (fDebugMask & 256) fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile); +#endif + if (fDebugMask & 512) fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile); + } + + + if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError RANDOM_ERROR) + { + HLTError("GPU Tracker returned Error Code %d in slice %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError, firstSlice + iSlice); + ResetHelperThreads(1); + for (int iSlice2 = 0;iSlice2 < sliceCountLocal;iSlice2++) clReleaseEvent(ocl->selector_events[iSlice2]); + return(1); + } + if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits()); + + if (Reconstruct_Base_FinishSlices(pOutput, iSlice, firstSlice)) return(1); + } + for (int iSlice2 = 0;iSlice2 < sliceCountLocal;iSlice2++) clReleaseEvent(ocl->selector_events[iSlice2]); + + if (Reconstruct_Base_Finalize(pOutput, tmpMemoryGlobalTracking, firstSlice)) return(1); + + return(0); +} + +int AliHLTTPCCAGPUTrackerOpenCL::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal) +{ + HLTFatal("Not implemented in OpenCL (ReconstructPP)"); + return(1); +} + +int AliHLTTPCCAGPUTrackerOpenCL::ExitGPU_Runtime() +{ + //Uninitialize OPENCL + + const int nStreams = CAMath::Max(3, fSliceCount); + for (int i = 0;i < nStreams;i++) clFinish(ocl->command_queue[i]); + + if (fGPUMemory) + { + clReleaseMemObject(ocl->mem_gpu); + clReleaseMemObject(ocl->mem_constant); + fGPUMemory = NULL; + + clReleaseKernel(ocl->kernel_neighbours_finder); + clReleaseKernel(ocl->kernel_neighbours_cleaner); + 
clReleaseKernel(ocl->kernel_start_hits_finder); + clReleaseKernel(ocl->kernel_start_hits_sorter); + clReleaseKernel(ocl->kernel_tracklet_constructor); + clReleaseKernel(ocl->kernel_tracklet_selector); + clReleaseKernel(ocl->kernel_row_blocks); + } + if (fHostLockedMemory) + { + clEnqueueUnmapMemObject(ocl->command_queue[0], ocl->mem_host, ocl->mem_host_ptr, 0, NULL, NULL); + ocl->mem_host_ptr = NULL; + for (int i = 0;i < nStreams;i++) + { + clReleaseCommandQueue(ocl->command_queue[i]); + } + clReleaseMemObject(ocl->mem_host); + fGpuTracker = NULL; + fHostLockedMemory = NULL; + } + + if (ocl->selector_events) + { + delete[] ocl->selector_events; + ocl->selector_events = NULL; + } + if (ocl->devices) + { + delete[] ocl->devices; + ocl->devices = NULL; + } + + clReleaseProgram(ocl->program); + clReleaseContext(ocl->context); + + HLTInfo("OPENCL Uninitialized"); + fCudaInitialized = 0; + return(0); +} + +int AliHLTTPCCAGPUTrackerOpenCL::RefitMergedTracks(AliHLTTPCGMMerger* Merger) +{ + HLTFatal("Not implemented in OpenCL (Merger)"); + return(1); +} + +void AliHLTTPCCAGPUTrackerOpenCL::ActivateThreadContext() +{ +} + +void AliHLTTPCCAGPUTrackerOpenCL::ReleaseThreadContext() +{ +} + +void AliHLTTPCCAGPUTrackerOpenCL::SynchronizeGPU() +{ + const int nStreams = CAMath::Max(3, fSliceCount); + for (int i = 0;i < nStreams;i++) clFinish(ocl->command_queue[i]); +} + +AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate() +{ + return new AliHLTTPCCAGPUTrackerOpenCL; +} + +void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr) +{ + delete ptr; +} diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.h b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.h new file mode 100644 index 00000000000..88dfcb0cb8a --- /dev/null +++ b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCL.h @@ -0,0 +1,65 @@ +//-*- Mode: C++ -*- +// $Id$ + +// ************************************************************************ +// This file is property of and copyright by the ALICE HLT Project * +// ALICE Experiment at CERN, All rights reserved. 
* +// See cxx source for full Copyright notice * +// * +//************************************************************************* + +// @file AliHLTTPCCAGPUTrackerOpenCL.h +// @author David Rohr, Sergey Gorbunov +// @date +// @brief TPC CA Tracker for the NVIDIA GPU +// @note + + +#ifndef ALIHLTTPCCAGPUTRACKEROPENCL_H +#define ALIHLTTPCCAGPUTRACKEROPENCL_H + +#include "AliHLTTPCCAGPUTrackerBase.h" + +struct AliHLTTPCCAGPUTrackerOpenCLInternals; + +class AliHLTTPCCAGPUTrackerOpenCL : public AliHLTTPCCAGPUTrackerBase +{ +public: + AliHLTTPCCAGPUTrackerOpenCL(); + virtual ~AliHLTTPCCAGPUTrackerOpenCL(); + + virtual int InitGPU_Runtime(int sliceCount = -1, int forceDeviceID = -1); + virtual int Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1); + virtual int ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1); + virtual int ExitGPU_Runtime(); + virtual int RefitMergedTracks(AliHLTTPCGMMerger* Merger); + +protected: + virtual void ActivateThreadContext(); + virtual void ReleaseThreadContext(); + virtual void SynchronizeGPU(); + virtual int GPUSync(char* state = "UNKNOWN", int sliceLocal = 0, int slice = 0); + +private: + void DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check = true); + bool GPUFailedMsgA(int, const char* file, int line); + AliHLTTPCCAGPUTrackerOpenCLInternals* ocl; + + + // disable copy + AliHLTTPCCAGPUTrackerOpenCL( const AliHLTTPCCAGPUTrackerOpenCL& ); + AliHLTTPCCAGPUTrackerOpenCL &operator=( const AliHLTTPCCAGPUTrackerOpenCL& ); + + ClassDef( AliHLTTPCCAGPUTrackerOpenCL, 0 ) +}; + +#ifdef R__WIN32 +#define DLL_EXPORT __declspec(dllexport) +#else +#define DLL_EXPORT +#endif + +extern "C" DLL_EXPORT AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate(); +extern "C" DLL_EXPORT void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr); + +#endif //ALIHLTTPCCAGPUTRACKER_H diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCLInternals.h b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCLInternals.h new file mode 100644 index 00000000000..537a6285fd2 --- /dev/null +++ b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCAGPUTrackerOpenCLInternals.h @@ -0,0 +1,40 @@ +//-*- Mode: C++ -*- +// $Id$ + +// ************************************************************************ +// This file is property of and copyright by the ALICE HLT Project * +// ALICE Experiment at CERN, All rights reserved. 
* +// See cxx source for full Copyright notice * +// * +//************************************************************************* + +// @file AliHLTTPCCAGPUTrackerOpenCL.h +// @author David Rohr, Sergey Gorbunov +// @date +// @brief TPC CA Tracker for the NVIDIA GPU +// @note + + +#ifndef ALIHLTTPCCAGPUTRACKEROPENCLINTERNALS_H +#define ALIHLTTPCCAGPUTRACKEROPENCLINTERNALS_H + +#include +#include + +struct AliHLTTPCCAGPUTrackerOpenCLInternals +{ + cl_device_id device; + cl_device_id* devices; + cl_context context; + cl_command_queue command_queue[36]; + cl_mem mem_gpu; + cl_mem mem_constant; + cl_mem mem_host; + void* mem_host_ptr; + cl_event* selector_events; + cl_program program; + + cl_kernel kernel_neighbours_finder, kernel_neighbours_cleaner, kernel_start_hits_finder, kernel_start_hits_sorter, kernel_tracklet_constructor, kernel_tracklet_selector, kernel_row_blocks; +}; + +#endif \ No newline at end of file diff --git a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCATrackletConstructorGPU.h b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCATrackletConstructorGPU.h index ec0a55062c8..ab171119d3c 100755 --- a/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCATrackletConstructorGPU.h +++ b/HLT/TPCLib/tracking-ca/cagpu/AliHLTTPCCATrackletConstructorGPU.h @@ -1,6 +1,6 @@ #include "AliHLTTPCCAGPUConfig.h" -GPUdi() void AliHLTTPCCATrackletConstructor::CopyTrackletTempData( AliHLTTPCCAThreadMemory &rMemSrc, AliHLTTPCCAThreadMemory &rMemDst, AliHLTTPCCATrackParam &tParamSrc, AliHLTTPCCATrackParam &tParamDst) +MEM_TEMPLATE4() GPUdi() void AliHLTTPCCATrackletConstructor::CopyTrackletTempData( MEM_TYPE(AliHLTTPCCAThreadMemory) &rMemSrc, MEM_TYPE2(AliHLTTPCCAThreadMemory) &rMemDst, MEM_TYPE3(AliHLTTPCCATrackParam) &tParamSrc, MEM_TYPE4(AliHLTTPCCATrackParam) &tParamDst) { //Copy Temporary Tracklet data from registers to global mem and vice versa rMemDst.fStartRow = rMemSrc.fStartRow; @@ -48,20 +48,20 @@ GPUdi() void AliHLTTPCCATrackletConstructor::CopyTrackletTempData( AliHLTTPCCATh } #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER -GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tracker, AliHLTTPCCASharedMemory &sMem, int Reverse, int RowBlock, int &mustInit) +GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &tracker, GPUshared() MEM_LOCAL(AliHLTTPCCASharedMemory) &sMem, int Reverse, int RowBlock, int &mustInit) { //Fetch a new trackled to be processed by this thread - __syncthreads(); + GPUsync(); int nextTrackletFirstRun = sMem.fNextTrackletFirstRun; - if (threadIdx.x == 0) + if (get_local_id(0) == 0) { sMem.fNTracklets = *tracker.NTracklets(); if (sMem.fNextTrackletFirstRun) { #ifdef HLTCA_GPU_SCHED_FIXED_START - const int iSlice = tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + (gridDim.x % tracker.GPUParametersConst()->fGPUnSlices != 0 && tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + 1) % gridDim.x != 0)) / gridDim.x; - const int nSliceBlockOffset = gridDim.x * iSlice / tracker.GPUParametersConst()->fGPUnSlices; - const uint2 &nTracklet = tracker.BlockStartingTracklet()[blockIdx.x - nSliceBlockOffset]; + const int iSlice = tracker.GPUParametersConst()->fGPUnSlices * (get_group_id(0) + (get_num_groups(0) % tracker.GPUParametersConst()->fGPUnSlices != 0 && tracker.GPUParametersConst()->fGPUnSlices * (get_group_id(0) + 1) % get_num_groups(0) != 0)) / get_num_groups(0); + const int nSliceBlockOffset = get_num_groups(0) * iSlice / tracker.GPUParametersConst()->fGPUnSlices; + const uint2 &nTracklet = 
tracker.BlockStartingTracklet()[get_group_id(0) - nSliceBlockOffset]; sMem.fNextTrackletCount = nTracklet.y; if (sMem.fNextTrackletCount == 0) @@ -84,7 +84,7 @@ GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tr else { const int4 oldPos = *tracker.RowBlockPos(Reverse, RowBlock); - const int nFetchTracks = CAMath::Max(CAMath::Min(oldPos.x - oldPos.y, HLTCA_GPU_THREAD_COUNT), 0); + const int nFetchTracks = CAMath::Max(CAMath::Min(oldPos.x - oldPos.y, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR), 0); sMem.fNextTrackletCount = nFetchTracks; const int nUseTrack = nFetchTracks ? CAMath::AtomicAdd(&(*tracker.RowBlockPos(Reverse, RowBlock)).y, nFetchTracks) : 0; sMem.fNextTrackletFirst = nUseTrack; @@ -99,30 +99,30 @@ GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tr } for (int i = 0;i < nFillTracks;i++) { - tracker.RowBlockTracklets(Reverse, RowBlock)[(nStartFillTrack + i) % HLTCA_GPU_MAX_TRACKLETS] = -(blockIdx.x * 1000000 + nFetchTracks * 10000 + oldPos.x * 100 + oldPos.y); //Dummy filling track + tracker.RowBlockTracklets(Reverse, RowBlock)[(nStartFillTrack + i) % HLTCA_GPU_MAX_TRACKLETS] = -(get_group_id(0) * 1000000 + nFetchTracks * 10000 + oldPos.x * 100 + oldPos.y); //Dummy filling track } } } } - __syncthreads(); + GPUsync(); mustInit = 0; if (sMem.fNextTrackletCount == 0) { return(-2); //No more track in this RowBlock } - else if (threadIdx.x >= sMem.fNextTrackletCount) + else if (get_local_id(0) >= sMem.fNextTrackletCount) { return(-1); //No track in this RowBlock for this thread } else if (nextTrackletFirstRun) { - if (threadIdx.x == 0) sMem.fNextTrackletFirstRun = 0; + if (get_local_id(0) == 0) sMem.fNextTrackletFirstRun = 0; mustInit = 1; - return(sMem.fNextTrackletFirst + threadIdx.x); + return(sMem.fNextTrackletFirst + get_local_id(0)); } else { - const int nTrackPos = sMem.fNextTrackletFirst + threadIdx.x; + const int nTrackPos = sMem.fNextTrackletFirst + get_local_id(0); mustInit = (nTrackPos < tracker.RowBlockPos(Reverse, RowBlock)->w); volatile int* const ptrTracklet = &tracker.RowBlockTracklets(Reverse, RowBlock)[nTrackPos % HLTCA_GPU_MAX_TRACKLETS]; int nTracklet; @@ -142,31 +142,31 @@ GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tr } } -GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(AliHLTTPCCATracker *pTracker) +MEM_CLASS_PRE2 GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(MEM_LG2(AliHLTTPCCATracker) *pTracker, GPUsharedref() AliHLTTPCCATrackletConstructor::MEM_LOCAL(AliHLTTPCCASharedMemory)& sMem) { //Main Tracklet construction function that calls the scheduled (FetchTracklet) and then Processes the tracklet (mainly UpdataTracklet) and at the end stores the tracklet. 
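//(Rough outline of the flow visible in the code below, per work-item: FetchTracklet picks the next tracklet index
//for the current row block and direction from the shared-memory scheduling counters, UpdateTracklet then extends
//the track parameters row by row in the forward and backward passes, and StoreTracklet writes the finished
//tracklet out; tracklets that cannot be finished in the current row block are copied back to global memory via
//CopyTrackletTempData and re-queued into the RowBlockTracklets lists with the atomic store counters.)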
//Can also dispatch a tracklet to be rescheduled #ifdef HLTCA_GPU_EMULATION_SINGLE_TRACKLET pTracker[0].BlockStartingTracklet()[0].x = HLTCA_GPU_EMULATION_SINGLE_TRACKLET; pTracker[0].BlockStartingTracklet()[0].y = 1; - for (int i = 1;i < gridDim.x;i++) + for (int i = 1;i < get_num_groups(0);i++) { pTracker[0].BlockStartingTracklet()[i].x = pTracker[0].BlockStartingTracklet()[i].y = 0; } #endif //HLTCA_GPU_EMULATION_SINGLE_TRACKLET - GPUshared() AliHLTTPCCASharedMemory sMem; + //GPUshared() AliHLTTPCCASharedMemory sMem; #ifdef HLTCA_GPU_SCHED_FIXED_START - if (threadIdx.x == 0) + if (get_local_id(0) == 0) { sMem.fNextTrackletFirstRun = 1; } - __syncthreads(); + GPUsync(); #endif //HLTCA_GPU_SCHED_FIXED_START #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE - if (threadIdx.x == 0) + if (get_local_id(0) == 0) { sMem.fMaxSync = 0; } @@ -178,13 +178,13 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A for (volatile int iRowBlock = 0;iRowBlock < HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1;iRowBlock++) { #ifdef HLTCA_GPU_SCHED_FIXED_SLICE - int iSlice = pTracker[0].GPUParametersConst()->fGPUnSlices * (blockIdx.x + (gridDim.x % pTracker[0].GPUParametersConst()->fGPUnSlices != 0 && pTracker[0].GPUParametersConst()->fGPUnSlices * (blockIdx.x + 1) % gridDim.x != 0)) / gridDim.x; + int iSlice = pTracker[0].GPUParametersConst()->fGPUnSlices * (get_group_id(0) + (get_num_groups(0) % pTracker[0].GPUParametersConst()->fGPUnSlices != 0 && pTracker[0].GPUParametersConst()->fGPUnSlices * (get_group_id(0) + 1) % get_num_groups(0) != 0)) / get_num_groups(0); #else for (int iSlice = 0;iSlice < pTracker[0].GPUParametersConst()->fGPUnSlices;iSlice++) #endif //HLTCA_GPU_SCHED_FIXED_SLICE { AliHLTTPCCATracker &tracker = pTracker[iSlice]; - if (blockIdx.x != 7 && sMem.fNextTrackletFirstRun && iSlice != (tracker.GPUParametersConst()->fGPUnSlices > gridDim.x ? blockIdx.x : (tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + (gridDim.x % tracker.GPUParametersConst()->fGPUnSlices != 0 && tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + 1) % gridDim.x != 0)) / gridDim.x))) + if (get_group_id(0) != 7 && sMem.fNextTrackletFirstRun && iSlice != (tracker.GPUParametersConst()->fGPUnSlices > get_num_groups(0) ? 
get_group_id(0) : (tracker.GPUParametersConst()->fGPUnSlices * (get_group_id(0) + (get_num_groups(0) % tracker.GPUParametersConst()->fGPUnSlices != 0 && tracker.GPUParametersConst()->fGPUnSlices * (get_group_id(0) + 1) % get_num_groups(0) != 0)) / get_num_groups(0)))) { continue; } @@ -196,13 +196,13 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A while ((iTracklet = FetchTracklet(tracker, sMem, iReverse, iRowBlock, mustInit)) != -2) { #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE - CAMath::AtomicMax(&sMem.fMaxSync, threadSync); - __syncthreads(); - threadSync = CAMath::Min(sMem.fMaxSync, 100000000 / blockDim.x / gridDim.x); + CAMath::AtomicMaxShared(&sMem.fMaxSync, threadSync); + GPUsync(); + threadSync = CAMath::Min(sMem.fMaxSync, 100000000 / get_local_size(0) / get_num_groups(0)); #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE if (!sharedRowsInitialized) { - for (int i = threadIdx.x;i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x) + for (int i = get_local_id(0);i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += get_local_size(0)) { reinterpret_cast(&sMem.fRows)[i] = reinterpret_cast(tracker.SliceDataRows())[i]; } @@ -211,16 +211,16 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A #ifdef HLTCA_GPU_RESCHED short2 storeToRowBlock; int storePosition = 0; - if (threadIdx.x < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1)) + if (get_local_id(0) < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1)) { - const int nReverse = threadIdx.x / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1); - const int nRowBlock = threadIdx.x % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1); + const int nReverse = get_local_id(0) / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1); + const int nRowBlock = get_local_id(0) % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1); sMem.fTrackletStoreCount[nReverse][nRowBlock] = 0; } #else mustInit = 1; #endif //HLTCA_GPU_RESCHED - __syncthreads(); + GPUsync(); AliHLTTPCCATrackParam tParam; AliHLTTPCCAThreadMemory rMem; @@ -260,11 +260,11 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A { #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && !(j >= rMem.fEndRow || ( j >= rMem.fStartRow && j - rMem.fStartRow % 2 == 0))) - pTracker[0].StageAtSync()[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1; + pTracker[0].StageAtSync()[threadSync++ * get_global_size(0) + get_global_id(0)] = rMem.fStage + 1; #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE if (iTracklet >= 0) { - UpdateTracklet(gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j); + UpdateTracklet(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam, j); if (rMem.fNMissed > kMaxRowGap && j <= rMem.fStartRow) { rMem.fGo = 0; @@ -275,7 +275,7 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A if (iTracklet >= 0 && (!rMem.fGo || iRowBlock == HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP)) { - StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam ); + StoreTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam ); } } else @@ -284,11 +284,11 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A { #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE if 
(rMem.fNMissed <= kMaxRowGap && rMem.fGo && j >= rMem.fStartRow && (rMem.fStage > 0 || rMem.fCurrIH >= 0 || (j - rMem.fStartRow) % 2 == 0 )) - pTracker[0].StageAtSync()[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1; + pTracker[0].StageAtSync()[threadSync++ * get_global_size(0) + get_global_id(0)] = rMem.fStage + 1; #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE if (iTracklet >= 0) { - UpdateTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j); + UpdateTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam, j); //if (rMem.fNMissed > kMaxRowGap || rMem.fGo == 0) break; //DR!!! CUDA Crashes with this enabled } } @@ -309,49 +309,49 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A if (iTracklet >= 0 && !rMem.fGo) { - StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam ); + StoreTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam ); } } if (rMem.fGo && (iRowBlock != HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP || iReverse == 0)) { CopyTrackletTempData( rMem, rMemGlobal, tParam, tParamGlobal ); - storePosition = CAMath::AtomicAdd(&sMem.fTrackletStoreCount[storeToRowBlock.y][storeToRowBlock.x], 1); + storePosition = CAMath::AtomicAddShared(&sMem.fTrackletStoreCount[storeToRowBlock.y][storeToRowBlock.x], 1); } - __syncthreads(); - if (threadIdx.x < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1)) + GPUsync(); + if (get_local_id(0) < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1)) { - const int nReverse = threadIdx.x / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1); - const int nRowBlock = threadIdx.x % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1); + const int nReverse = get_local_id(0) / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1); + const int nRowBlock = get_local_id(0) % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1); if (sMem.fTrackletStoreCount[nReverse][nRowBlock]) { sMem.fTrackletStoreCount[nReverse][nRowBlock] = CAMath::AtomicAdd(&tracker.RowBlockPos(nReverse, nRowBlock)->x, sMem.fTrackletStoreCount[nReverse][nRowBlock]); } } - __syncthreads(); + GPUsync(); if (iTracklet >= 0 && rMem.fGo && (iRowBlock != HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP || iReverse == 0)) { tracker.RowBlockTracklets(storeToRowBlock.y, storeToRowBlock.x)[sMem.fTrackletStoreCount[storeToRowBlock.y][storeToRowBlock.x] + storePosition] = iTracklet; } - __syncthreads(); + GPUsync(); #else - if (threadIdx.x % HLTCA_GPU_WARP_SIZE == 0) + if (get_local_id(0) % HLTCA_GPU_WARP_SIZE == 0) { - sMem.fStartRows[threadIdx.x / HLTCA_GPU_WARP_SIZE] = 160; - sMem.fEndRows[threadIdx.x / HLTCA_GPU_WARP_SIZE] = 0; + sMem.fStartRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE] = 160; + sMem.fEndRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE] = 0; } - __syncthreads(); + GPUsync(); if (iTracklet >= 0) { - CAMath::AtomicMin(&sMem.fStartRows[threadIdx.x / HLTCA_GPU_WARP_SIZE], rMem.fStartRow); + CAMath::AtomicMinShared(&sMem.fStartRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE], rMem.fStartRow); } - __syncthreads(); + GPUsync(); if (iTracklet >= 0) { - for (int j = sMem.fStartRows[threadIdx.x / HLTCA_GPU_WARP_SIZE];j < HLTCA_ROW_COUNT;j++) + for (int j = sMem.fStartRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE];j < HLTCA_ROW_COUNT;j++) { - UpdateTracklet(gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j); + 
UpdateTracklet(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam, j); if (!rMem.fGo) break; } @@ -361,19 +361,19 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A { if ( !tParam.TransportToX( tracker.Row( rMem.fEndRow ).X(), tracker.Param().ConstBz(), .999 ) ) rMem.fGo = 0; } - CAMath::AtomicMax(&sMem.fEndRows[threadIdx.x / HLTCA_GPU_WARP_SIZE], rMem.fEndRow); + CAMath::AtomicMaxShared(&sMem.fEndRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE], rMem.fEndRow); } - __syncthreads(); + GPUsync(); if (iTracklet >= 0) { for (int j = rMem.fEndRow;j >= 0;j--) { if (!rMem.fGo) break; - UpdateTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j); + UpdateTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam, j); } - StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam ); + StoreTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam ); } #endif //HLTCA_GPU_RESCHED } @@ -424,33 +424,33 @@ GPUg() void AliHLTTPCCATrackletConstructorInit(int iSlice) { //GPU Wrapper for AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorInit AliHLTTPCCATracker &tracker = ( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker )[iSlice]; - int i = blockIdx.x * blockDim.x + threadIdx.x; + int i = get_global_id(0); if (i >= *tracker.NTracklets()) return; AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorInit(i, tracker); } #elif defined(HLTCA_GPU_ALTERNATIVE_SCHEDULER_SIMPLE) -GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tracker, AliHLTTPCCASharedMemory &sMem, AliHLTTPCCAThreadMemory& /*rMem*/, AliHLTTPCCATrackParam& /*tParam*/) +GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &tracker, GPUsharedref() MEM_LOCAL(AliHLTTPCCASharedMemory) &sMem, AliHLTTPCCAThreadMemory& /*rMem*/, MEM_PLAIN(AliHLTTPCCATrackParam)& /*tParam*/) { - const int nativeslice = blockIdx.x % tracker.GPUParametersConst()->fGPUnSlices; + const int nativeslice = get_group_id(0) % tracker.GPUParametersConst()->fGPUnSlices; const int nTracklets = *tracker.NTracklets(); - __syncthreads(); + GPUsync(); if (sMem.fNextTrackletFirstRun == 1) { - if (threadIdx.x == 0) + if (get_local_id(0) == 0) { - sMem.fNextTrackletFirst = (blockIdx.x - nativeslice) / tracker.GPUParametersConst()->fGPUnSlices * HLTCA_GPU_THREAD_COUNT; + sMem.fNextTrackletFirst = (get_group_id(0) - nativeslice) / tracker.GPUParametersConst()->fGPUnSlices * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR; sMem.fNextTrackletFirstRun = 0; } } else { - if (threadIdx.x == 0) + if (get_local_id(0) == 0) { if (tracker.GPUParameters()->fNextTracklet < nTracklets) { - const int firstTracklet = CAMath::AtomicAdd(&tracker.GPUParameters()->fNextTracklet, HLTCA_GPU_THREAD_COUNT); + const int firstTracklet = CAMath::AtomicAdd(&tracker.GPUParameters()->fNextTracklet, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR); if (firstTracklet < nTracklets) sMem.fNextTrackletFirst = firstTracklet; else sMem.fNextTrackletFirst = -2; } @@ -460,28 +460,27 @@ GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tr } } } - __syncthreads(); + GPUsync(); return (sMem.fNextTrackletFirst); } -GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(AliHLTTPCCATracker *pTracker) +GPUdi() void 
AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) *pTracker, GPUsharedref() AliHLTTPCCATrackletConstructor::MEM_LOCAL(AliHLTTPCCASharedMemory)& sMem) { const int nSlices = pTracker[0].GPUParametersConst()->fGPUnSlices; - const int nativeslice = blockIdx.x % nSlices; - GPUshared() AliHLTTPCCASharedMemory sMem; + const int nativeslice = get_group_id(0) % nSlices; int currentSlice = -1; - if (threadIdx.x) + if (get_local_id(0)) { sMem.fNextTrackletFirstRun = 1; } for (int iSlice = 0;iSlice < nSlices;iSlice++) { - AliHLTTPCCATracker &tracker = pTracker[(nativeslice + iSlice) % nSlices]; + GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &tracker = pTracker[(nativeslice + iSlice) % nSlices]; int iRow, iRowEnd; - AliHLTTPCCATrackParam tParam; + MEM_PLAIN(AliHLTTPCCATrackParam) tParam; AliHLTTPCCAThreadMemory rMem; int tmpTracklet; @@ -489,7 +488,7 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A { if (tmpTracklet >= 0) { - rMem.fItr = tmpTracklet + threadIdx.x; + rMem.fItr = tmpTracklet + get_local_id(0); } else { @@ -498,17 +497,17 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A if (iSlice != currentSlice) { - if (threadIdx.x == 0) + if (get_local_id(0) == 0) { sMem.fNTracklets = *tracker.NTracklets(); } - for (int i = threadIdx.x;i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x) + for (int i = get_local_id(0);i < HLTCA_ROW_COUNT * sizeof(MEM_PLAIN(AliHLTTPCCARow)) / sizeof(int);i += get_local_size(0)) { - reinterpret_cast(&sMem.fRows)[i] = reinterpret_cast(tracker.SliceDataRows())[i]; + reinterpret_cast(&sMem.fRows)[i] = reinterpret_cast(tracker.SliceDataRows())[i]; } currentSlice = iSlice; - __syncthreads(); + GPUsync(); } if (rMem.fItr < sMem.fNTracklets) @@ -567,41 +566,41 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A #else //HLTCA_GPU_ALTERNATIVE_SCHEDULER -GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tracker, AliHLTTPCCASharedMemory &sMem, AliHLTTPCCAThreadMemory &rMem, AliHLTTPCCATrackParam &tParam) +GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &tracker, GPUsharedref() MEM_LOCAL(AliHLTTPCCASharedMemory) &sMem, AliHLTTPCCAThreadMemory &rMem, MEM_PLAIN(AliHLTTPCCATrackParam) &tParam) { - const int nativeslice = blockIdx.x % tracker.GPUParametersConst()->fGPUnSlices; + const int nativeslice = get_group_id(0) % tracker.GPUParametersConst()->fGPUnSlices; const int nTracklets = *tracker.NTracklets(); - __syncthreads(); - if (threadIdx.x == 0) sMem.fTrackletStorePos = 0; + GPUsync(); + if (get_local_id(0) == 0) sMem.fTrackletStorePos = 0; int nStorePos = -1; if (sMem.fNextTrackletFirstRun == 1) { - if (threadIdx.x == 0) + if (get_local_id(0) == 0) { - sMem.fNextTrackletFirst = (blockIdx.x - nativeslice) / tracker.GPUParametersConst()->fGPUnSlices * HLTCA_GPU_THREAD_COUNT; + sMem.fNextTrackletFirst = (get_group_id(0) - nativeslice) / tracker.GPUParametersConst()->fGPUnSlices * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR; sMem.fNextTrackletFirstRun = 0; - sMem.fNextTrackletCount = HLTCA_GPU_THREAD_COUNT; + sMem.fNextTrackletCount = HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR; } } else { - if (sMem.fNextTrackletCount < HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_ALTSCHED_MIN_THREADS) + if (sMem.fNextTrackletCount < HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR - HLTCA_GPU_ALTSCHED_MIN_THREADS) { - if (threadIdx.x == 0) + if (get_local_id(0) == 0) { 
sMem.fNextTrackletFirst = -1; } } else { - __syncthreads(); + GPUsync(); if (rMem.fItr != -1) { - nStorePos = CAMath::AtomicAdd(&sMem.fTrackletStorePos, 1); + nStorePos = CAMath::AtomicAddShared(&sMem.fTrackletStorePos, 1); CopyTrackletTempData(rMem, sMem.swapMemory[nStorePos].fThreadMem, tParam, sMem.swapMemory[nStorePos].fParam); rMem.fItr = -1; } - if (threadIdx.x == 0) + if (get_local_id(0) == 0) { if (tracker.GPUParameters()->fNextTracklet >= nTracklets) { @@ -623,12 +622,12 @@ GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tr } } - if (threadIdx.x == 0) + if (get_local_id(0) == 0) { - if (sMem.fNextTrackletFirst == -1 && sMem.fNextTrackletCount == HLTCA_GPU_THREAD_COUNT) + if (sMem.fNextTrackletFirst == -1 && sMem.fNextTrackletCount == HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR) { sMem.fNextTrackletFirst = -2; - sMem.fNextTrackletCount = HLTCA_GPU_THREAD_COUNT; + sMem.fNextTrackletCount = HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR; } else if (sMem.fNextTrackletFirst >= 0) { @@ -642,28 +641,28 @@ GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tr } } } - __syncthreads(); - if (threadIdx.x < sMem.fTrackletStorePos) + GPUsync(); + if (get_local_id(0) < sMem.fTrackletStorePos) { - CopyTrackletTempData(sMem.swapMemory[threadIdx.x].fThreadMem, rMem, sMem.swapMemory[threadIdx.x].fParam, tParam); + CopyTrackletTempData(sMem.swapMemory[get_local_id(0)].fThreadMem, rMem, sMem.swapMemory[get_local_id(0)].fParam, tParam); } return (sMem.fNextTrackletFirst); } -GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(AliHLTTPCCATracker *pTracker) +GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) *pTracker, GPUsharedref() AliHLTTPCCATrackletConstructor::MEM_LOCAL(AliHLTTPCCASharedMemory)& sMem) { const int nSlices = pTracker[0].GPUParametersConst()->fGPUnSlices; - const int nativeslice = blockIdx.x % nSlices; - GPUshared() AliHLTTPCCASharedMemory sMem; + const int nativeslice = get_group_id(0) % nSlices; + //GPUshared() AliHLTTPCCASharedMemory sMem; int currentSlice = -1; - if (threadIdx.x) + if (get_local_id(0)) { sMem.fNextTrackletFirstRun = 1; } #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE - if (threadIdx.x == 0) + if (get_local_id(0) == 0) { sMem.fMaxSync = 0; } @@ -672,9 +671,9 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A for (int iSlice = 0;iSlice < nSlices;iSlice++) { - AliHLTTPCCATracker &tracker = pTracker[(nativeslice + iSlice) % nSlices]; + GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &tracker = pTracker[(nativeslice + iSlice) % nSlices]; - AliHLTTPCCATrackParam tParam; + MEM_PLAIN(AliHLTTPCCATrackParam) tParam; AliHLTTPCCAThreadMemory rMem; rMem.fItr = -1; @@ -683,26 +682,26 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A { #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE - CAMath::AtomicMax(&sMem.fMaxSync, threadSync); - __syncthreads(); - threadSync = CAMath::Min(sMem.fMaxSync, 100000000 / blockDim.x / gridDim.x); + CAMath::AtomicMaxShared(&sMem.fMaxSync, threadSync); + GPUsync(); + threadSync = CAMath::Min(sMem.fMaxSync, 100000000 / get_local_size(0) / get_num_groups(0)); #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE if (iSlice != currentSlice) { - if (threadIdx.x == 0) sMem.fNTracklets = *tracker.NTracklets(); + if (get_local_id(0) == 0) sMem.fNTracklets = *tracker.NTracklets(); - for (int i = threadIdx.x;i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / 
sizeof(int);i += blockDim.x) + for (int i = get_local_id(0);i < HLTCA_ROW_COUNT * sizeof(MEM_PLAIN(AliHLTTPCCARow)) / sizeof(int);i += get_local_size(0)) { - reinterpret_cast(&sMem.fRows)[i] = reinterpret_cast(tracker.SliceDataRows())[i]; + reinterpret_cast(&sMem.fRows)[i] = reinterpret_cast(tracker.SliceDataRows())[i]; } currentSlice = iSlice; - __syncthreads(); + GPUsync(); } if (tmpTracklet >= 0 && rMem.fItr < 0) { - rMem.fItr = tmpTracklet + (signed) threadIdx.x - sMem.fTrackletStorePos; + rMem.fItr = tmpTracklet + (signed) get_local_id(0) - sMem.fTrackletStorePos; if (rMem.fItr >= sMem.fNTracklets) { rMem.fItr = -1; @@ -734,15 +733,15 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A if (rMem.fStage == 2) { if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && !(rMem.fIRow >= rMem.fEndRow || ( rMem.fIRow >= rMem.fStartRow && rMem.fIRow - rMem.fStartRow % 2 == 0))) - pTracker[0].StageAtSync()[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1; + pTracker[0].StageAtSync()[threadSync++ * get_global_size(0) + get_global_id(0)] = rMem.fStage + 1; } else { if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && rMem.fIRow >= rMem.fStartRow && (rMem.fStage > 0 || rMem.fCurrIH >= 0 || (rMem.fIRow - rMem.fStartRow) % 2 == 0 )) - pTracker[0].StageAtSync()[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1; + pTracker[0].StageAtSync()[threadSync++ * get_global_size(0) + get_global_id(0)] = rMem.fStage + 1; } #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE - UpdateTracklet(gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, rMem.fIRow); + UpdateTracklet(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam, rMem.fIRow); } if (rMem.fIRow == rMem.fIRowEnd || rMem.fNMissed > kMaxRowGap) @@ -763,9 +762,9 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A if (!rMem.fGo) { - StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam ); + StoreTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam ); rMem.fItr = -1; - CAMath::AtomicAdd(&sMem.fNextTrackletCount, 1); + CAMath::AtomicAddShared(&sMem.fNextTrackletCount, 1); } } } @@ -774,17 +773,19 @@ GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(A #endif //HLTCA_GPU_ALTERNATIVE_SCHEDULER +#ifndef __OPENCL__ GPUg() void AliHLTTPCCATrackletConstructorGPU() { //GPU Wrapper for AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU AliHLTTPCCATracker *pTracker = ( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker ); - AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(pTracker); + GPUshared() AliHLTTPCCATrackletConstructor::MEM_LOCAL(AliHLTTPCCASharedMemory) sMem; + AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(pTracker, sMem); } GPUg() void AliHLTTPCCATrackletConstructorGPUPP(int firstSlice, int sliceCount) { - if (blockIdx.x >= sliceCount) return; - AliHLTTPCCATracker *pTracker = &( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker )[firstSlice + blockIdx.x]; + if (get_group_id(0) >= sliceCount) return; + AliHLTTPCCATracker *pTracker = &( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker )[firstSlice + get_group_id(0)]; AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPUPP(pTracker); } @@ -795,17 +796,17 @@ GPUd() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPUPP( 
#define startRows sMem.fStartRows #define endRows sMem.fEndRows #else - GPUshared() int startRows[HLTCA_GPU_THREAD_COUNT / HLTCA_GPU_WARP_SIZE + 1]; - GPUshared() int endRows[HLTCA_GPU_THREAD_COUNT / HLTCA_GPU_WARP_SIZE + 1]; + GPUshared() int startRows[HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR / HLTCA_GPU_WARP_SIZE + 1]; + GPUshared() int endRows[HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR / HLTCA_GPU_WARP_SIZE + 1]; #endif sMem.fNTracklets = *tracker->NTracklets(); - for (int i = threadIdx.x;i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x) + for (int i = get_local_id(0);i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += get_local_size(0)) { reinterpret_cast(&sMem.fRows)[i] = reinterpret_cast(tracker->SliceDataRows())[i]; } - for (int iTracklet = threadIdx.x;iTracklet < (*tracker->NTracklets() / HLTCA_GPU_THREAD_COUNT + 1) * HLTCA_GPU_THREAD_COUNT;iTracklet += blockDim.x) + for (int iTracklet = get_local_id(0);iTracklet < (*tracker->NTracklets() / HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR + 1) * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;iTracklet += get_local_size(0)) { AliHLTTPCCATrackParam tParam; AliHLTTPCCAThreadMemory rMem; @@ -826,22 +827,22 @@ GPUd() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPUPP( rMem.fGo = 1; } - if (threadIdx.x % HLTCA_GPU_WARP_SIZE == 0) + if (get_local_id(0) % HLTCA_GPU_WARP_SIZE == 0) { - startRows[threadIdx.x / HLTCA_GPU_WARP_SIZE] = 160; - endRows[threadIdx.x / HLTCA_GPU_WARP_SIZE] = 0; + startRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE] = 160; + endRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE] = 0; } - __syncthreads(); + GPUsync(); if (iTracklet < *tracker->NTracklets()) { - CAMath::AtomicMin(&startRows[threadIdx.x / HLTCA_GPU_WARP_SIZE], rMem.fStartRow); + CAMath::AtomicMinShared(&startRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE], rMem.fStartRow); } - __syncthreads(); + GPUsync(); if (iTracklet < *tracker->NTracklets()) { - for (int j = startRows[threadIdx.x / HLTCA_GPU_WARP_SIZE];j < HLTCA_ROW_COUNT;j++) + for (int j = startRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE];j < HLTCA_ROW_COUNT;j++) { - UpdateTracklet(gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, *tracker, tParam, j); + UpdateTracklet(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, *tracker, tParam, j); if (!rMem.fGo) break; } @@ -851,18 +852,20 @@ GPUd() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPUPP( { if ( !tParam.TransportToX( tracker->Row( rMem.fEndRow ).X(), tracker->Param().ConstBz(), .999 ) ) rMem.fGo = 0; } - CAMath::AtomicMax(&endRows[threadIdx.x / HLTCA_GPU_WARP_SIZE], rMem.fEndRow); + CAMath::AtomicMaxShared(&endRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE], rMem.fEndRow); } - __syncthreads(); + GPUsync(); if (iTracklet < *tracker->NTracklets()) { for (int j = rMem.fEndRow;j >= 0;j--) { if (!rMem.fGo) break; - UpdateTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, *tracker, tParam, j); + UpdateTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, *tracker, tParam, j); } - StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, *tracker, tParam ); + StoreTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, *tracker, tParam ); } } -} \ No newline at end of file +} + +#endif diff --git a/HLT/TPCLib/tracking-ca/cagpu/makefile b/HLT/TPCLib/tracking-ca/cagpu/makefile index 447942a8aee..3d21e022388 100755 --- a/HLT/TPCLib/tracking-ca/cagpu/makefile +++ 
b/HLT/TPCLib/tracking-ca/cagpu/makefile @@ -1,36 +1,49 @@ -all: libAliHLTTPCCAGPU.so +all: libAliHLTTPCCAGPU.so libAliHLTTPCCAGPUOpenCL.so clean: - rm -f libAliHLTTPCCAGPU.so AliHLTTPCCAGPUTrackerNVCC.o G__AliHLTTPCCAGPU.o AliHLTTPCCAGPUTrackerNVCC.cu.tmp.cxx AliHLTTPCCAGPUTrackerNVCC.cu.cxx G__AliHLTTPCCAGPUAutoLinkDef.h G__AliHLTTPCCAGPU.h G__AliHLTTPCCAGPU.cxx + rm -f libAliHLTTPCCAGPU*.so AliHLTTPCCAGPUTracker*.o G__AliHLTTPCCAGPU*.o AliHLTTPCCAGPUTrackerNVCC.cu.cxx G__AliHLTTPCCAGPUAutoLinkDef*.h G__AliHLTTPCCAGPU*.h G__AliHLTTPCCAGPU*.cxx makefiles/opencl_compiler AliHLTTPCCAGPUTrackerOpenCLCode.* +libAliHLTTPCCAGPU.so: AliHLTTPCCAGPUTrackerNVCC.o AliHLTTPCCAGPUTrackerBase.o G__AliHLTTPCCAGPUNVCC.o + c++ -shared -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L${ALICE_ROOT}/lib/tgt_${ALICE_TARGET} -L. -lcuda -lcudart -lAliHLTTPC -o $@ $^ -libAliHLTTPCCAGPU.so: AliHLTTPCCAGPUTrackerNVCC.o G__AliHLTTPCCAGPU.o - c++ -shared AliHLTTPCCAGPUTrackerNVCC.o G__AliHLTTPCCAGPU.o -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L${ALICE_ROOT}/lib/tgt_${ALICE_TARGET} -L. -lcuda -lcudart -lAliHLTTPC -o libAliHLTTPCCAGPU.so - +libAliHLTTPCCAGPUOpenCL.so: AliHLTTPCCAGPUTrackerOpenCL.o AliHLTTPCCAGPUTrackerBase.o G__AliHLTTPCCAGPUOpenCL.o AliHLTTPCCAGPUTrackerOpenCLCode.o + c++ -shared -L$(AMDAPPSDKROOT)/lib/x86_64 -L${ALICE_ROOT}/lib/tgt_${ALICE_TARGET} -L. -lOpenCL -lAliHLTTPC -o $@ $^ AliHLTTPCCAGPUTrackerNVCC.o: AliHLTTPCCAGPUTrackerNVCC.cu.cxx - c++ -fPIC -DPACKAGE_TARNAME=\"alice-hlt\" -DPACKAGE_VERSION=\"35631\" -DPACKAGE_BUGREPORT=\"Matthias.Richter@ift.uib.no\" -DPACKAGE=\"alice-hlt\" -DVERSION=\"35631\" -DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_DLFCN_H=1 -DLT_OBJDIR=\".libs/\" -DNDEBUG=1 -Duse_aliroot=1 -Duse_root=1 -DHAVE_HOMERREADER=1 -DHLT_SAMPLE=1 -DHLT_UTIL=1 -DHAVE_ALITPCRAWSTREAM_H=1 -DHLT_TPC=1 -DHAVE_NOT_TPCOFFLINE_REC=1 -DHAVE_TPC_MAPPING=1 -DHAVE_ALIALTRODECODER_H=1 -DHLT_RCU=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_CALO=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_PHOS=1 -DHLT_EMCAL=1 -DHLT_TRD=1 -DHLT_FMD=1 -DHAVE_ALIMPEXMAP_H=1 -DHAVE_ALIMUONTRIGGERIO_H=1 -DHLT_MUON=1 -DHLT_TRIGGER=1 -DHLT_GLOBAL=1 -DHLT_JET=1 -DHAVE_ALIITSCOMPRESSRAWDATASDD_H=1 -DHLT_ITS=1 -DHLT_COMP=1 -DMODULE=AliHLTTPC -W -Weffc++ -Wall -Wshadow -DROOTVERSION=\"5.25/02\" -DALIROOTVERSION=\"Unknown\" -O2 -DBUILD_GPU -c AliHLTTPCCAGPUTrackerNVCC.cu.cxx -o AliHLTTPCCAGPUTrackerNVCC.o + c++ -fPIC -DPACKAGE_TARNAME=\"alice-hlt\" -DPACKAGE_VERSION=\"35631\" -DPACKAGE_BUGREPORT=\"Matthias.Richter@ift.uib.no\" -DPACKAGE=\"alice-hlt\" -DVERSION=\"35631\" -DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_DLFCN_H=1 -DLT_OBJDIR=\".libs/\" -DNDEBUG=1 -Duse_aliroot=1 -Duse_root=1 -DHAVE_HOMERREADER=1 -DHLT_SAMPLE=1 -DHLT_UTIL=1 -DHAVE_ALITPCRAWSTREAM_H=1 -DHLT_TPC=1 -DHAVE_NOT_TPCOFFLINE_REC=1 -DHAVE_TPC_MAPPING=1 -DHAVE_ALIALTRODECODER_H=1 -DHLT_RCU=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_CALO=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_PHOS=1 -DHLT_EMCAL=1 -DHLT_TRD=1 -DHLT_FMD=1 -DHAVE_ALIMPEXMAP_H=1 -DHAVE_ALIMUONTRIGGERIO_H=1 -DHLT_MUON=1 -DHLT_TRIGGER=1 -DHLT_GLOBAL=1 -DHLT_JET=1 -DHAVE_ALIITSCOMPRESSRAWDATASDD_H=1 -DHLT_ITS=1 -DHLT_COMP=1 -DMODULE=AliHLTTPC -W -Wall -Wshadow -Wno-effc++ -DROOTVERSION=\"5.25/02\" -DALIROOTVERSION=\"Unknown\" -O2 -DBUILD_GPU -c 
AliHLTTPCCAGPUTrackerNVCC.cu.cxx -o AliHLTTPCCAGPUTrackerNVCC.o + +AliHLTTPCCAGPUTrackerBase.o: AliHLTTPCCAGPUTrackerBase.cxx + c++ -fPIC -DPACKAGE_TARNAME=\"alice-hlt\" -DPACKAGE_VERSION=\"35631\" -DPACKAGE_BUGREPORT=\"Matthias.Richter@ift.uib.no\" -DPACKAGE=\"alice-hlt\" -DVERSION=\"35631\" -DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_DLFCN_H=1 -DLT_OBJDIR=\".libs/\" -DNDEBUG=1 -Duse_aliroot=1 -Duse_root=1 -DHAVE_HOMERREADER=1 -DHLT_SAMPLE=1 -DHLT_UTIL=1 -DHAVE_ALITPCRAWSTREAM_H=1 -DHLT_TPC=1 -DHAVE_NOT_TPCOFFLINE_REC=1 -DHAVE_TPC_MAPPING=1 -DHAVE_ALIALTRODECODER_H=1 -DHLT_RCU=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_CALO=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_PHOS=1 -DHLT_EMCAL=1 -DHLT_TRD=1 -DHLT_FMD=1 -DHAVE_ALIMPEXMAP_H=1 -DHAVE_ALIMUONTRIGGERIO_H=1 -DHLT_MUON=1 -DHLT_TRIGGER=1 -DHLT_GLOBAL=1 -DHLT_JET=1 -DHAVE_ALIITSCOMPRESSRAWDATASDD_H=1 -DHLT_ITS=1 -DHLT_COMP=1 -DMODULE=AliHLTTPC -W -Weffc++ -Wall -Wshadow -DROOTVERSION=\"5.25/02\" -DALIROOTVERSION=\"Unknown\" -O2 -DBUILD_GPU -I${ALICE_ROOT}/HLT/BASE -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca -I${ROOTSYS}/include -c $< -o $@ + +AliHLTTPCCAGPUTrackerOpenCL.o: AliHLTTPCCAGPUTrackerOpenCL.cxx + c++ -fPIC -DPACKAGE_TARNAME=\"alice-hlt\" -DPACKAGE_VERSION=\"35631\" -DPACKAGE_BUGREPORT=\"Matthias.Richter@ift.uib.no\" -DPACKAGE=\"alice-hlt\" -DVERSION=\"35631\" -DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_DLFCN_H=1 -DLT_OBJDIR=\".libs/\" -DNDEBUG=1 -Duse_aliroot=1 -Duse_root=1 -DHAVE_HOMERREADER=1 -DHLT_SAMPLE=1 -DHLT_UTIL=1 -DHAVE_ALITPCRAWSTREAM_H=1 -DHLT_TPC=1 -DHAVE_NOT_TPCOFFLINE_REC=1 -DHAVE_TPC_MAPPING=1 -DHAVE_ALIALTRODECODER_H=1 -DHLT_RCU=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_CALO=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_PHOS=1 -DHLT_EMCAL=1 -DHLT_TRD=1 -DHLT_FMD=1 -DHAVE_ALIMPEXMAP_H=1 -DHAVE_ALIMUONTRIGGERIO_H=1 -DHLT_MUON=1 -DHLT_TRIGGER=1 -DHLT_GLOBAL=1 -DHLT_JET=1 -DHAVE_ALIITSCOMPRESSRAWDATASDD_H=1 -DHLT_ITS=1 -DHLT_COMP=1 -DMODULE=AliHLTTPC -W -Weffc++ -Wall -Wshadow -DROOTVERSION=\"5.25/02\" -DALIROOTVERSION=\"Unknown\" -O2 -DBUILD_GPU -I$(AMDAPPSDKROOT)/include -I${ALICE_ROOT}/HLT/BASE -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca -I${ROOTSYS}/include -Imakefiles -Wno-write-strings -c $< -o $@ + + +G__AliHLTTPCCAGPU%.cxx: AliHLTTPCCAGPUTracker%.h G__AliHLTTPCCAGPUAutoLinkDef%.h + rootcint -f $@ -c -Duse_aliroot -Duse_root -DROWHOUGHPARAMS -Duse_reconstruction -Duse_newio -DROOTVERSION=\"unchecked\" -DALIROOTVERSION=\"unchecked\" -D__ROOT__ -DUSE_ALILOG -DLINUX -DNDEBUG -D_MODULE_=\"HLT\" -D`uname` -DDATE_SYS=`uname` -Dlong32='int' -Dlong64='long long' -DdatePointer='long' -I${ROOTSYS}/include -pthread -m64 -DWITHXML -DWITHXML -DUSE_ROOT -DWITHXML -I${ALICE_ROOT}/HLT/BASE -I${ALICE_ROOT}/HLT/BASE/util -I${ALICE_ROOT}/HLT -I${ALICE_ROOT}/HLT/TPCLib -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca $^ -G__AliHLTTPCCAGPU.cxx: G__AliHLTTPCCAGPUAtoLinkDef.h - rootcint -f G__AliHLTTPCCAGPU.cxx -c -Duse_aliroot -Duse_root -DROWHOUGHPARAMS -Duse_reconstruction -Duse_newio -DROOTVERSION=\"unchecked\" -DALIROOTVERSION=\"unchecked\" -D__ROOT__ -DUSE_ALILOG -DLINUX -DNDEBUG -D_MODULE_=\"HLT\" -D`uname` -DDATE_SYS=`uname` -Dlong32='int' -Dlong64='long long' -DdatePointer='long' -I${ROOTSYS}/include -pthread -m64 -DWITHXML -DWITHXML -DUSE_ROOT -DWITHXML -I${ALICE_ROOT}/HLT/BASE 
-I${ALICE_ROOT}/HLT/BASE/util -I${ALICE_ROOT}/HLT -I${ALICE_ROOT}/HLT/TPCLib -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca AliHLTTPCCAGPUTrackerNVCC.h G__AliHLTTPCCAGPUAutoLinkDef.h +G__AliHLTTPCCAGPUAutoLinkDef%.h: AliHLTTPCCAGPUTracker%.h + echo '//automatically generated ROOT DICT definition' > $@ + echo '//!!! DO NOT EDIT THIS FILE !!!' >> $@ + echo '#ifdef __CINT__' >> $@ + echo '#pragma link off all globals;' >> $@ + echo '#pragma link off all classes;' >> $@ + echo '#pragma link off all functions;' >> $@ + echo "#pragma link C++ class $<+;" | sed "s/\.h//" >> $@ + echo "#pragma link C++ class AliHLTTPCCAGPUTrackerBase+;" >> $@ + echo '#endif' >> $@ -G__AliHLTTPCCAGPUAtoLinkDef.h: AliHLTTPCCAGPUTrackerNVCC.h AliHLTTPCCAGPUTrackerNVCC.cu - echo '//automatically generated ROOT DICT definition' > G__AliHLTTPCCAGPUAutoLinkDef.h - echo '//!!! DO NOT EDIT THIS FILE !!!' >> G__AliHLTTPCCAGPUAutoLinkDef.h - echo '#ifdef __CINT__' >> G__AliHLTTPCCAGPUAutoLinkDef.h - echo '#pragma link off all globals;' >> G__AliHLTTPCCAGPUAutoLinkDef.h - echo '#pragma link off all classes;' >> G__AliHLTTPCCAGPUAutoLinkDef.h - echo '#pragma link off all functions;' >> G__AliHLTTPCCAGPUAutoLinkDef.h - echo "#pragma link C++ class AliHLTTPCCAGPUTrackerNVCC+;" >> G__AliHLTTPCCAGPUAutoLinkDef.h - echo '#endif' >> G__AliHLTTPCCAGPUAutoLinkDef.h +G__AliHLTTPCCAGPU%.o: G__AliHLTTPCCAGPU%.cxx + g++ -DcudaError_t=int -Duse_aliroot -Duse_root -DROWHOUGHPARAMS -Duse_reconstruction -Duse_newio -DROOTVERSION=\"unchecked\" -DALIROOTVERSION=\"unchecked\" -D__ROOT__ -DUSE_ALILOG -DLINUX -DNDEBUG -DBUILD_GPU -D_MODULE_=\"HLT\" -I${ALICE_ROOT}/HLT/TPCLib -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca -I${ALICE_ROOT}/HLT/BASE -c $< -o $@ -O -g -W -Wall -Weffc++ -fPIC -pipe -fmessage-length=0 -Wno-long-long -ansi -Dlinux -D`uname` -DDATE_SYS=`uname` -Dlong32='int' -Dlong64='long long' -DdatePointer='long' -I${ROOTSYS}/include -pthread -m64 -D__PHOSUTIL__ -D__EMCALUTIL__ -G__AliHLTTPCCAGPU.o: G__AliHLTTPCCAGPU.cxx - g++ -DcudaError_t=int -Duse_aliroot -Duse_root -DROWHOUGHPARAMS -Duse_reconstruction -Duse_newio -DROOTVERSION=\"unchecked\" -DALIROOTVERSION=\"unchecked\" -D__ROOT__ -DUSE_ALILOG -DLINUX -DNDEBUG -DBUILD_GPU -D_MODULE_=\"HLT\" -I${ALICE_ROOT}/HLT/TPCLib -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca -I${ALICE_ROOT}/HLT/BASE -c G__AliHLTTPCCAGPU.cxx -o G__AliHLTTPCCAGPU.o -O -g -W -Wall -Weffc++ -fPIC -pipe -fmessage-length=0 -Wno-long-long -ansi -Dlinux -D`uname` -DDATE_SYS=`uname` -Dlong32='int' -Dlong64='long long' -DdatePointer='long' -I${ROOTSYS}/include -pthread -m64 -D__PHOSUTIL__ -D__EMCALUTIL__ +AliHLTTPCCAGPUTrackerNVCC.cu.cxx: AliHLTTPCCAGPUTrackerNVCC.cu + nvcc --cuda --use_fast_math --maxrregcount 64 -O4 -Xptxas -v -Xptxas -O4 -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 --compiler-options "-DPACKAGE_TARNAME=\"alice-hlt\" -DPACKAGE_VERSION=\"35631\" -DPACKAGE_BUGREPORT=\"Matthias.Richter@ift.uib.no\" -DPACKAGE=\"alice-hlt\" -DVERSION=\"35631\" -DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_DLFCN_H=1 -DLT_OBJDIR=\".libs/\" -DNDEBUG=1 -Duse_aliroot=1 -Duse_root=1 -DHAVE_HOMERREADER=1 -DHLT_SAMPLE=1 -DHLT_UTIL=1 -DHAVE_ALITPCRAWSTREAM_H=1 -DHLT_TPC=1 -DHAVE_NOT_TPCOFFLINE_REC=1 -DHAVE_TPC_MAPPING=1 -DHAVE_ALIALTRODECODER_H=1 -DHLT_RCU=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_CALO=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_PHOS=1 
-DHLT_EMCAL=1 -DHLT_TRD=1 -DHLT_FMD=1 -DHAVE_ALIMPEXMAP_H=1 -DHAVE_ALIMUONTRIGGERIO_H=1 -DHLT_MUON=1 -DHLT_TRIGGER=1 -DHLT_GLOBAL=1 -DHLT_JET=1 -DHAVE_ALIITSCOMPRESSRAWDATASDD_H=1 -DHLT_ITS=1 -DHLT_COMP=1 -DMODULE=AliHLTTPC -IRCU -W -Wall -Wshadow -DROOTVERSION=\"5.25/02\" -DALIROOTVERSION=\"Unknown\" -O2 -DBUILD_GPU -I${ALICE_ROOT}/HLT/BASE -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca -I${ROOTSYS}/include" -I. $< --output-file $@ -AliHLTTPCCAGPUTrackerNVCC.cu.cxx: AliHLTTPCCAGPUTrackerNVCC.cu.tmp.cxx - cat AliHLTTPCCAGPUTrackerNVCC.cu.tmp.cxx | grep -v "^#" > AliHLTTPCCAGPUTrackerNVCC.cu.cxx - -patch -r /dev/null -s --no-backup-if-mismatch -i AliHLTTPCCAGPUTrackerNVCC.cu.x86_64-pc-linux-gnu.patch AliHLTTPCCAGPUTrackerNVCC.cu.cxx +AliHLTTPCCAGPUTrackerOpenCLCode.o: AliHLTTPCCAGPUTrackerOpenCLCode.bin + gcc -c makefiles/include.S -o $@ -AliHLTTPCCAGPUTrackerNVCC.cu.tmp.cxx: AliHLTTPCCAGPUTrackerNVCC.cu - nvcc --cuda --use_fast_math --maxrregcount 64 -O4 -Xptxas -v -Xptxas -O4 -gencode arch=compute_20,code=sm_20 --compiler-options "-DPACKAGE_TARNAME=\"alice-hlt\" -DPACKAGE_VERSION=\"35631\" -DPACKAGE_BUGREPORT=\"Matthias.Richter@ift.uib.no\" -DPACKAGE=\"alice-hlt\" -DVERSION=\"35631\" -DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_DLFCN_H=1 -DLT_OBJDIR=\".libs/\" -DNDEBUG=1 -Duse_aliroot=1 -Duse_root=1 -DHAVE_HOMERREADER=1 -DHLT_SAMPLE=1 -DHLT_UTIL=1 -DHAVE_ALITPCRAWSTREAM_H=1 -DHLT_TPC=1 -DHAVE_NOT_TPCOFFLINE_REC=1 -DHAVE_TPC_MAPPING=1 -DHAVE_ALIALTRODECODER_H=1 -DHLT_RCU=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_CALO=1 -DHAVE_ALICALORAWSTREAM=1 -DHLT_PHOS=1 -DHLT_EMCAL=1 -DHLT_TRD=1 -DHLT_FMD=1 -DHAVE_ALIMPEXMAP_H=1 -DHAVE_ALIMUONTRIGGERIO_H=1 -DHLT_MUON=1 -DHLT_TRIGGER=1 -DHLT_GLOBAL=1 -DHLT_JET=1 -DHAVE_ALIITSCOMPRESSRAWDATASDD_H=1 -DHLT_ITS=1 -DHLT_COMP=1 -DMODULE=AliHLTTPC -IRCU -W -Weffc++ -Wall -Wshadow -DROOTVERSION=\"5.25/02\" -DALIROOTVERSION=\"Unknown\" -O2 -DBUILD_GPU -I${ALICE_ROOT}/HLT/BASE -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca -I${ROOTSYS}/include" -I. AliHLTTPCCAGPUTrackerNVCC.cu --output-file AliHLTTPCCAGPUTrackerNVCC.cu.tmp.cxx +AliHLTTPCCAGPUTrackerOpenCLCode.bin: AliHLTTPCCAGPUTrackerOpenCL.cl makefiles/opencl_compiler + makefiles/opencl_compiler -output-file $@ AliHLTTPCCAGPUTrackerOpenCL.cl -- -I. 
-I${ALICE_ROOT}/HLT/BASE -I${ALICE_ROOT}/HLT/TPCLib/tracking-ca -I${ROOTSYS}/include -x clc++ +makefiles/opencl_compiler: makefiles/makefile_opencl_compiler.cpp + c++ $< -o $@ -I$(AMDAPPSDKROOT)/include -L$(AMDAPPSDKROOT)/lib/x86_64 -lOpenCL diff --git a/HLT/TPCLib/tracking-ca/cagpu/makefiles/include.S b/HLT/TPCLib/tracking-ca/cagpu/makefiles/include.S new file mode 100644 index 00000000000..5b4e029425c --- /dev/null +++ b/HLT/TPCLib/tracking-ca/cagpu/makefiles/include.S @@ -0,0 +1,8 @@ + .global _makefile_opencl_program_cagpubuild_AliHLTTPCCAGPUTrackerOpenCL_cl + .global _makefile_opencl_program_cagpubuild_AliHLTTPCCAGPUTrackerOpenCL_cl_size + .section .rodata +_makefile_opencl_program_cagpubuild_AliHLTTPCCAGPUTrackerOpenCL_cl: + .incbin "AliHLTTPCCAGPUTrackerOpenCLCode.bin" +1: +_makefile_opencl_program_cagpubuild_AliHLTTPCCAGPUTrackerOpenCL_cl_size: + .int 1b - _makefile_opencl_program_cagpubuild_AliHLTTPCCAGPUTrackerOpenCL_cl diff --git a/HLT/TPCLib/tracking-ca/cagpu/makefiles/makefile_opencl_compiler.cpp b/HLT/TPCLib/tracking-ca/cagpu/makefiles/makefile_opencl_compiler.cpp new file mode 100644 index 00000000000..fa2104e3c35 --- /dev/null +++ b/HLT/TPCLib/tracking-ca/cagpu/makefiles/makefile_opencl_compiler.cpp @@ -0,0 +1,232 @@ +#define _CRT_SECURE_NO_WARNINGS +#include "CL/opencl.h" +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <vector> +#include <string> + +#include "opencl_compiler_structs.h" + +#define quit(arg) {fprintf(stderr, arg "\n");return(1);} +#define DEFAULT_OPENCL_COMPILER_OPTIONS "" +#define DEFAULT_OUTPUT_FILE "opencl.out" + +int main(int argc, char** argv) +{ + const char* output_file = DEFAULT_OUTPUT_FILE; + std::string compiler_options = DEFAULT_OPENCL_COMPILER_OPTIONS; + std::vector<char*> files; + + printf("Passing command line options:\n"); + bool add_option = false; + for (int i = 1;i < argc;i++) + { + if (add_option) + { + compiler_options += " "; + compiler_options += argv[i]; + } + else if (strcmp(argv[i], "--") == 0) + { + add_option = true; + } + else if (strcmp(argv[i], "-output-file") == 0) + { + if (++i >= argc) quit("Output file name missing"); + output_file = argv[i]; + } + else + { + fprintf(stderr, "%s\n", argv[i]); + files.push_back(argv[i]); + } + } + + cl_int ocl_error; + cl_uint num_platforms; + if (clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS) quit("Error getting OpenCL Platform Count"); + if (num_platforms == 0) quit("No OpenCL Platform found"); + printf("%d OpenCL Platforms found\n", num_platforms); + + //Query platforms + cl_platform_id* platforms = new cl_platform_id[num_platforms]; + if (platforms == NULL) quit("Memory allocation error"); + if (clGetPlatformIDs(num_platforms, platforms, NULL) != CL_SUCCESS) quit("Error getting OpenCL Platforms"); + + cl_platform_id platform; + bool found = false; + + _makefiles_opencl_platform_info pinfo; + for (unsigned int i_platform = 0;i_platform < num_platforms;i_platform++) + { + clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_PROFILE, 64, pinfo.platform_profile, NULL); + clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_VERSION, 64, pinfo.platform_version, NULL); + clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_NAME, 64, pinfo.platform_name, NULL); + clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_VENDOR, 64, pinfo.platform_vendor, NULL); + printf("Available Platform %d: (%s %s) %s %s\n", i_platform, pinfo.platform_profile, pinfo.platform_version, pinfo.platform_vendor, pinfo.platform_name); + if (strcmp(pinfo.platform_vendor, "Advanced Micro Devices, Inc.") == 0) + { + found = true; + printf("AMD 
OpenCL Platform found\n"); + platform = platforms[i_platform]; + break; + } + } + if (found == false) + { + quit("Did not find AMD OpenCL Platform"); + } + + if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &pinfo.count) != CL_SUCCESS) + { + quit("Error getting OPENCL Device Count"); + } + + //Query devices + cl_device_id* devices = new cl_device_id[pinfo.count]; + if (devices == NULL) quit("Memory allocation error"); + if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, pinfo.count, devices, NULL) != CL_SUCCESS) quit("Error getting OpenCL devices"); + + _makefiles_opencl_device_info dinfo; + cl_device_type device_type; + cl_uint freq, shaders; + + printf("Available OPENCL devices:\n"); + for (unsigned int i = 0;i < pinfo.count;i++) + { + printf("Examining device %d\n", i); + + clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 64, dinfo.device_name, NULL); + clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, 64, dinfo.device_vendor, NULL); + clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL); + clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(freq), &freq, NULL); + clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(shaders), &shaders, NULL); + clGetDeviceInfo(devices[i], CL_DEVICE_ADDRESS_BITS, sizeof(dinfo.nbits), &dinfo.nbits, NULL); + printf("Found Device %d: %s %s (Frequency %d, Shaders %d, %d bit)\n", i, dinfo.device_vendor, dinfo.device_name, (int) freq, (int) shaders, (int) dinfo.nbits); + } + + if (files.size() == 0) + { + quit("Syntax: opencl [-output-file OUTPUT_FILE] FILE1 [FILE2] ... [FILEn] [-- COMPILER_OPTION_1] [COMPILER_OPTION_2] ... [COMPILER_OPTION_N]"); + } + + char** buffers = (char**) malloc(files.size() * sizeof(char*)); + if (buffers == NULL) quit("Memory allocation error\n"); + for (unsigned int i = 0;i < files.size();i++) + { + printf("Reading source file %s\n", files[i]); + FILE* fp = fopen(files[i], "rb"); + if (fp == NULL) + { + printf("Cannot open %s\n", files[i]); + return(1); + } + fseek(fp, 0, SEEK_END); + size_t file_size = ftell(fp); + fseek(fp, 0, SEEK_SET); + + buffers[i] = (char*) malloc(file_size + 1); + if (buffers[i] == NULL) + { + quit("Memory allocation error"); + } + if (fread(buffers[i], 1, file_size, fp) != file_size) + { + quit("Error reading file"); + } + buffers[i][file_size] = 0; + fclose(fp); + } + + printf("Creating OpenCL Context\n"); + //Create OpenCL context + cl_context context = clCreateContext(NULL, pinfo.count, devices, NULL, NULL, &ocl_error); + if (ocl_error != CL_SUCCESS) quit("Error creating OpenCL context"); + + printf("Creating OpenCL Program Object\n"); + //Create OpenCL program object + cl_program program = clCreateProgramWithSource(context, (cl_uint) files.size(), (const char**) buffers, NULL, &ocl_error); + if (ocl_error != CL_SUCCESS) quit("Error creating program object"); + + printf("Compiling OpenCL Program\n"); + //Compile program + ocl_error = clBuildProgram(program, pinfo.count, devices, compiler_options.c_str(), NULL, NULL); + if (ocl_error != CL_SUCCESS) + { + fprintf(stderr, "OpenCL Error while building program: %d (Compiler options: %s)\n", ocl_error, compiler_options.c_str()); + fprintf(stderr, "OpenCL Kernel:\n\n"); + for (unsigned int i = 0;i < files.size();i++) + { + printf("%s\n\n", buffers[i]); + } + + for (unsigned int i = 0;i < pinfo.count;i++) + { + cl_build_status status; + clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_STATUS, sizeof(status), &status, NULL); + if (status == CL_BUILD_ERROR) + { + size_t log_size; + 
clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char* build_log = (char*) malloc(log_size + 1); + if (build_log == NULL) quit("Memory allocation error"); + clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); + fprintf(stderr, "Build Log (device %d):\n\n%s\n\n", i, build_log); + free(build_log); + } + } + } + for (unsigned int i = 0;i < files.size();i++) + { + free(buffers[i]); + } + free(buffers); + if (ocl_error != CL_SUCCESS) return(1); + + printf("Obtaining program binaries\n"); + size_t* binary_sizes = (size_t*) malloc(pinfo.count * sizeof(size_t)); + if (binary_sizes == NULL) quit("Memory allocation error"); + clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, pinfo.count * sizeof(size_t), binary_sizes, NULL); + char** binary_buffers = (char**) malloc(pinfo.count * sizeof(char*)); + if (binary_buffers == NULL) quit("Memory allocation error"); + for (unsigned int i = 0;i < pinfo.count;i++) + { + printf("Binary size for device %d: %d\n", i, (int) binary_sizes[i]); + binary_buffers[i] = (char*) malloc(binary_sizes[i]); + if (binary_buffers[i] == NULL) quit("Memory allocation error"); + memset(binary_buffers[i], 0, binary_sizes[i]); + } + clGetProgramInfo(program, CL_PROGRAM_BINARIES, pinfo.count * sizeof(char*), binary_buffers, NULL); + + printf("Programs obtained successfully, cleaning up opencl\n"); + clReleaseProgram(program); + clReleaseContext(context); + + printf("Writing binaries to file (%s)\n", output_file); + FILE* fp; + fp = fopen(output_file, "w+b"); + if (fp == NULL) quit("Error opening output file\n"); + const char* magic_bytes = "QOCLPB"; + fwrite(magic_bytes, 1, strlen(magic_bytes) + 1, fp); + fwrite(&pinfo, 1, sizeof(pinfo), fp); + for (unsigned int i = 0;i < pinfo.count;i++) + { + clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 64, dinfo.device_name, NULL); + clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, 64, dinfo.device_vendor, NULL); + dinfo.binary_size = binary_sizes[i]; + fwrite(&dinfo, 1, sizeof(dinfo), fp); + fwrite(binary_buffers[i], 1, binary_sizes[i], fp); + } + fclose(fp); + + printf("All done, cleaning up remaining buffers\n"); + for (unsigned int i = 0;i < pinfo.count;i++) + { + free(binary_buffers[i]); + } + free(binary_sizes); + free(binary_buffers); + + return(0); +} \ No newline at end of file diff --git a/HLT/TPCLib/tracking-ca/cagpu/makefiles/opencl_compiler_structs.h b/HLT/TPCLib/tracking-ca/cagpu/makefiles/opencl_compiler_structs.h new file mode 100644 index 00000000000..bba156e37fd --- /dev/null +++ b/HLT/TPCLib/tracking-ca/cagpu/makefiles/opencl_compiler_structs.h @@ -0,0 +1,16 @@ +struct _makefiles_opencl_platform_info +{ + char platform_profile[64]; + char platform_version[64]; + char platform_name[64]; + char platform_vendor[64]; + cl_uint count; +}; + +struct _makefiles_opencl_device_info +{ + char device_name[64]; + char device_vendor[64]; + cl_uint nbits; + size_t binary_size; +}; diff --git a/HLT/TPCLib/tracking-ca/cagpu/makefiles/opencl_obtain_program.h b/HLT/TPCLib/tracking-ca/cagpu/makefiles/opencl_obtain_program.h new file mode 100644 index 00000000000..4c03c68fe86 --- /dev/null +++ b/HLT/TPCLib/tracking-ca/cagpu/makefiles/opencl_obtain_program.h @@ -0,0 +1,86 @@ +#ifndef MAKEFILES_OPENCL_OBTAIN_PROGRAMH +#define MAKEFILES_OPENCL_OBTAIN_PROGRAMH + +#include <string.h> +#include <vector> +#include "opencl_compiler_structs.h" + +static int _makefiles_opencl_obtain_program_helper(cl_context context, cl_uint num_devices, cl_device_id* devices, cl_program* program, char* binaries) +{ + 
const char* magic_bytes = "QOCLPB"; + if (strncmp(magic_bytes, binaries, strlen(magic_bytes)) != 0) + { + printf("Internal error accessing opencl program\n"); + return(1); + } + char* current_ptr = binaries + strlen(magic_bytes) + 1; + _makefiles_opencl_platform_info* pinfo = (_makefiles_opencl_platform_info*) current_ptr; + current_ptr += sizeof(_makefiles_opencl_platform_info); + + if (num_devices != pinfo->count) + { + printf("Number of devices differs from number of devices in opencl program\n"); + return(1); + } + //printf("Obtaining program for OpenCL Platform: (%s %s) %s %s\n", pinfo->platform_profile, pinfo->platform_version, pinfo->platform_vendor, pinfo->platform_name); + + std::vector<size_t> program_sizes(pinfo->count); + std::vector<char*> program_binaries(pinfo->count); + + for (unsigned int i = 0;i < pinfo->count;i++) + { + char device_name[64], device_vendor[64]; + cl_uint nbits; + clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 64, device_name, NULL); + clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, 64, device_vendor, NULL); + clGetDeviceInfo(devices[i], CL_DEVICE_ADDRESS_BITS, sizeof(nbits), &nbits, NULL); + _makefiles_opencl_device_info* dinfo = (_makefiles_opencl_device_info*) current_ptr; + if (strcmp(device_name, dinfo->device_name) != 0 || strcmp(device_vendor, dinfo->device_vendor) != 0) + { + printf("Device list is different to device list from opencl program\n"); + return(1); + } + if (nbits != dinfo->nbits) + { + printf("Pointer size of device and stored device binary differs\n"); + return(1); + } + current_ptr += sizeof(_makefiles_opencl_device_info); + //printf("Device %d: %s %s (size %lld)\n", i, dinfo->device_vendor, dinfo->device_name, (long long int) dinfo->binary_size); + program_sizes[i] = dinfo->binary_size; + program_binaries[i] = current_ptr; + current_ptr += dinfo->binary_size; + } + + std::vector<cl_int> return_status(pinfo->count); + cl_int ocl_error; + *program = clCreateProgramWithBinary(context, num_devices, devices, program_sizes.data(), (const unsigned char**) program_binaries.data(), return_status.data(), &ocl_error); + + if (ocl_error != CL_SUCCESS) + { + printf("Error loading program\n"); + return(1); + } + + for (unsigned int i = 0;i < pinfo->count;i++) + { + if (return_status[i] != CL_SUCCESS) + { + printf("Error loading program for device %d\n", i); + clReleaseProgram(*program); + return(1); + } + } + + ocl_error = clBuildProgram(*program, num_devices, devices, "", NULL, NULL); + if (ocl_error != CL_SUCCESS) + { + printf("Error building program\n"); + clReleaseProgram(*program); + return(1); + } + + return(0); +} + +#endif \ No newline at end of file -- 2.43.5
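Taken together, makefiles/opencl_compiler writes a small container (the magic string "QOCLPB", one _makefiles_opencl_platform_info record, then for each device a _makefiles_opencl_device_info record followed by the raw program binary), makefiles/include.S embeds that container into the shared library, and opencl_obtain_program.h turns it back into a cl_program via clCreateProgramWithBinary and clBuildProgram. Below is a minimal host-side usage sketch, assuming only the symbol name that include.S defines; the function and variable names are illustrative and not part of the patch.

// Hypothetical usage of the helper from opencl_obtain_program.h (sketch only).
#include <CL/opencl.h>
#include "makefiles/opencl_obtain_program.h"

// Blob emitted by makefiles/opencl_compiler and embedded by makefiles/include.S.
extern "C" char _makefile_opencl_program_cagpubuild_AliHLTTPCCAGPUTrackerOpenCL_cl[];

static cl_program LoadTrackerProgram(cl_context context, cl_uint nDevices, cl_device_id* devices)
{
	cl_program program = NULL;
	// Returns 0 on success; the helper prints its own diagnostics on mismatch or build failure.
	if (_makefiles_opencl_obtain_program_helper(context, nDevices, devices, &program,
		_makefile_opencl_program_cagpubuild_AliHLTTPCCAGPUTrackerOpenCL_cl))
	{
		return NULL;
	}
	return program; // ready for clCreateKernel(...)
}

Shipping pre-built binaries instead of kernel source avoids compiling the OpenCL code at every startup, but it ties the library to the platform and device list recorded at build time; the helper's strict checks on device name, vendor, and address bits make that restriction explicit.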