// *
//***************************************************************************
-#include "AliHLTTPCCAGPUTracker.h"
+#include "AliHLTTPCCAGPUTrackerNVCC.h"
-#ifdef BUILD_GPU
+#ifdef HLTCA_GPUCODE
+#include <cuda.h>
+#include <sm_11_atomic_functions.h>
+#include <sm_12_atomic_functions.h>
+#endif
+
+#ifdef R__WIN32
+#else
+#include <sys/syscall.h>
+#include <semaphore.h>
+#include <fcntl.h>
+#endif
#include "AliHLTTPCCADef.h"
#include "AliHLTTPCCAGPUConfig.h"
-#include <sm_11_atomic_functions.h>
-#include <sm_12_atomic_functions.h>
#include <iostream>
texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
#endif
-#include "AliHLTTPCCAHit.h"
-
//Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code
#include "AliHLTTPCCATrackParam.cxx"
#include "AliHLTTPCCATrack.cxx"
-#include "AliHLTTPCCATrackletSelector.cxx"
-
#include "AliHLTTPCCAHitArea.cxx"
#include "AliHLTTPCCAGrid.cxx"
#include "AliHLTTPCCARow.cxx"
#include "AliHLTTPCCAParam.cxx"
#include "AliHLTTPCCATracker.cxx"
-#include "AliHLTTPCCAOutTrack.cxx"
-
#include "AliHLTTPCCAProcess.h"
+#include "AliHLTTPCCATrackletSelector.cxx"
#include "AliHLTTPCCANeighboursFinder.cxx"
-
#include "AliHLTTPCCANeighboursCleaner.cxx"
#include "AliHLTTPCCAStartHitsFinder.cxx"
#include "AliHLTTPCCAStartHitsSorter.cxx"
#include "AliHLTSystem.h"
#endif
-ClassImp( AliHLTTPCCAGPUTracker )
+ClassImp( AliHLTTPCCAGPUTrackerNVCC )
+
+bool AliHLTTPCCAGPUTrackerNVCC::fgGPUUsed = false;
+
+#define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem"
+
// Default constructor: zero/NULL-initialize all pointers and state flags.
// No CUDA calls are made here — device selection, memory allocation and
// stream creation are all deferred to InitGPU().
+AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() :
+ fGpuTracker(NULL),
+ fGPUMemory(NULL),
+ fHostLockedMemory(NULL),
+ fDebugLevel(0),
+ fOutFile(NULL),
+ fGPUMemSize(0),
+ fpCudaStreams(NULL),
+ fSliceCount(0),
+ fOutputControl(NULL),
+ fThreadId(0),
+ fCudaInitialized(0)
+ {};
+
// Trivial destructor — GPU and host resources are released explicitly via
// ExitGPU(), not here. NOTE(review): confirm callers always invoke ExitGPU()
// before destruction, otherwise fGPUMemory/fHostLockedMemory leak.
+AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC() {};
+
+void AliHLTTPCCAGPUTrackerNVCC::ReleaseGlobalLock(void* sem)
+{
+	//Release the global named semaphore that locks GPU Initialization.
+	//sem: opaque handle created in InitGPU (HANDLE* on Windows, sem_t* on POSIX).
+#ifdef R__WIN32
+	HANDLE* h = (HANDLE*) sem;
+	ReleaseSemaphore(*h, 1, NULL);
+	CloseHandle(*h);
+	delete h;
+#else
+	sem_t* pSem = (sem_t*) sem;
+	sem_post(pSem);
+	//sem_close releases this process' descriptor; without it the semaphore
+	//mapping obtained from sem_open in InitGPU is leaked (sem_unlink only
+	//removes the name, mirroring the CloseHandle call in the Windows branch).
+	sem_close(pSem);
+	sem_unlink(SemLockName);
+#endif
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::CheckMemorySizes(int sliceCount)
+{
+	//Check constants for correct memory sizes.
+	//Returns 0 when the compile-time memory budgets suffice for sliceCount
+	//slices, 1 (after logging an error) otherwise.
+	if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
+	{
+		HLTError("Insufficient Tracker Object Memory");
+		return(1);
+	}
+
+	if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
+	{
+		HLTError("Insufficient Common Memory");
+		return(1);
+	}
-bool AliHLTTPCCAGPUTracker::fgGPUUsed = false;
+	if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
+	{
+		HLTError("Insufficient Row Memory");
+		return(1);
+	}
+	return(0);
+}
-int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
+int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)
{
//Find best CUDA device, initialize and allocate memory
-
+
+ if (CheckMemorySizes(sliceCount)) return(1);
+
+#ifdef R__WIN32
+ HANDLE* semLock = new HANDLE;
+ *semLock = CreateSemaphore(NULL, 1, 1, SemLockName);
+ if (*semLock == NULL)
+ {
+ HLTError("Error creating GPUInit Semaphore");
+ return(1);
+ }
+ WaitForSingleObject(*semLock, INFINITE);
+#else
+ sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1);
+ if (semLock == SEM_FAILED)
+ {
+ HLTError("Error creating GPUInit Semaphore");
+ return(1);
+ }
+ sem_wait(semLock);
+#endif
+
if (fgGPUUsed)
{
HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in same process");
+ ReleaseGlobalLock(semLock);
return(1);
}
+ fgGPUUsed = 1;
+ fThreadId = GetThread();
cudaDeviceProp fCudaDeviceProp;
+ fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
+
#ifndef CUDA_DEVICE_EMULATION
- int count, bestDevice = -1, bestDeviceSpeed = 0;
+ int count, bestDevice = -1;
+ long long int bestDeviceSpeed = 0, deviceSpeed;
if (CudaFailedMsg(cudaGetDeviceCount(&count)))
{
HLTError("Error getting CUDA Device Count");
+ fgGPUUsed = 0;
+ ReleaseGlobalLock(semLock);
return(1);
}
- if (fDebugLevel >= 2) std::cout << "Available CUDA devices: ";
+ if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");
for (int i = 0;i < count;i++)
{
- cudaGetDeviceProperties(&fCudaDeviceProp, i);
- if (fDebugLevel >= 2) std::cout << fCudaDeviceProp.name << " (" << i << ") ";
- if (fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && fCudaDeviceProp.multiProcessorCount * fCudaDeviceProp.clockRate > bestDeviceSpeed)
+ unsigned int free, total;
+ cuInit(0);
+ CUdevice tmpDevice;
+ cuDeviceGet(&tmpDevice, i);
+ CUcontext tmpContext;
+ cuCtxCreate(&tmpContext, 0, tmpDevice);
+ if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";
+ cuCtxDestroy(tmpContext);
+ CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i));
+
+ int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && free >= fGPUMemSize;
+
+ if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %d / %lld)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, free, (long long int) fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");
+ deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free;
+ if (deviceOK && deviceSpeed > bestDeviceSpeed)
{
bestDevice = i;
- bestDeviceSpeed = fCudaDeviceProp.multiProcessorCount * fCudaDeviceProp.clockRate;
+ bestDeviceSpeed = deviceSpeed;
}
}
- if (fDebugLevel >= 2) std::cout << std::endl;
-
if (bestDevice == -1)
{
HLTWarning("No CUDA Device available, aborting CUDA Initialisation");
+ fgGPUUsed = 0;
+ ReleaseGlobalLock(semLock);
return(1);
}
if (fDebugLevel >= 1)
{
- std::cout<<"CUDA Device Properties: "<<std::endl;
- std::cout<<"name = "<<fCudaDeviceProp.name<<std::endl;
- std::cout<<"totalGlobalMem = "<<fCudaDeviceProp.totalGlobalMem<<std::endl;
- std::cout<<"sharedMemPerBlock = "<<fCudaDeviceProp.sharedMemPerBlock<<std::endl;
- std::cout<<"regsPerBlock = "<<fCudaDeviceProp.regsPerBlock<<std::endl;
- std::cout<<"warpSize = "<<fCudaDeviceProp.warpSize<<std::endl;
- std::cout<<"memPitch = "<<fCudaDeviceProp.memPitch<<std::endl;
- std::cout<<"maxThreadsPerBlock = "<<fCudaDeviceProp.maxThreadsPerBlock<<std::endl;
- std::cout<<"maxThreadsDim = "<<fCudaDeviceProp.maxThreadsDim[0]<<" "<<fCudaDeviceProp.maxThreadsDim[1]<<" "<<fCudaDeviceProp.maxThreadsDim[2]<<std::endl;
- std::cout<<"maxGridSize = " <<fCudaDeviceProp.maxGridSize[0]<<" "<<fCudaDeviceProp.maxGridSize[1]<<" "<<fCudaDeviceProp.maxGridSize[2]<<std::endl;
- std::cout<<"totalConstMem = "<<fCudaDeviceProp.totalConstMem<<std::endl;
- std::cout<<"major = "<<fCudaDeviceProp.major<<std::endl;
- std::cout<<"minor = "<<fCudaDeviceProp.minor<<std::endl;
- std::cout<<"clockRate = "<<fCudaDeviceProp.clockRate<<std::endl;
- std::cout<<"textureAlignment = "<<fCudaDeviceProp.textureAlignment<<std::endl;
+ HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);
+ HLTInfo("totalGlobalMem = %lld", (unsigned long long int) fCudaDeviceProp.totalGlobalMem);
+ HLTInfo("sharedMemPerBlock = %lld", (unsigned long long int) fCudaDeviceProp.sharedMemPerBlock);
+ HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);
+ HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);
+ HLTInfo("memPitch = %lld", (unsigned long long int) fCudaDeviceProp.memPitch);
+ HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);
+ HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);
+ HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);
+ HLTInfo("totalConstMem = %lld", (unsigned long long int) fCudaDeviceProp.totalConstMem);
+ HLTInfo("major = %d", fCudaDeviceProp.major);
+ HLTInfo("minor = %d", fCudaDeviceProp.minor);
+ HLTInfo("clockRate %d= ", fCudaDeviceProp.clockRate);
+ HLTInfo("textureAlignment %lld= ", (unsigned long long int) fCudaDeviceProp.textureAlignment);
}
if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
{
HLTError( "Unsupported CUDA Device" );
- return(1);
+ fgGPUUsed = 0;
+ ReleaseGlobalLock(semLock);
+ return(1);
}
if (CudaFailedMsg(cudaSetDevice(cudaDevice)))
{
HLTError("Could not set CUDA Device!");
+ fgGPUUsed = 0;
+ ReleaseGlobalLock(semLock);
return(1);
}
- if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
- {
- HLTError("Insufficiant Common Memory");
- cudaThreadExit();
- return(1);
- }
-
- if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
- {
- HLTError("Insufficiant Row Memory");
- cudaThreadExit();
- return(1);
- }
-
- fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
{
HLTError("CUDA Memory Allocation Error");
cudaThreadExit();
+ fgGPUUsed = 0;
+ ReleaseGlobalLock(semLock);
return(1);
}
+ ReleaseGlobalLock(semLock);
if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
cudaFree(fGPUMemory);
cudaThreadExit();
HLTError("Error allocating Page Locked Host Memory");
+ fgGPUUsed = 0;
return(1);
}
if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);
{
CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
}
- HLTInfo("CUDA Initialisation successfull");
- //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
- if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
- {
- HLTError("Insufficiant Tracker Object Memory");
- return(1);
- }
fSliceCount = sliceCount;
+ //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);
for (int i = 0;i < fgkNSlices;i++)
{
fSlaveTrackers[i].SetGPUTracker();
fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
- fSlaveTrackers[i].pData()->SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
+ fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
}
fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
{
if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
{
- HLTError("Error creating CUDA Stream");
- return(1);
+ cudaFree(fGPUMemory);
+ cudaFreeHost(fHostLockedMemory);
+ cudaThreadExit();
+ HLTError("Error creating CUDA Stream");
+ fgGPUUsed = 0;
+ return(1);
}
}
+ fCudaInitialized = 1;
+ HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %d)", cudaDevice, fCudaDeviceProp.name, fThreadId);
+
#if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)
if (fDebugLevel < 2)
{
//Do one initial run for Benchmark reasons
const int useDebugLevel = fDebugLevel;
fDebugLevel = 0;
- AliHLTTPCCAClusterData tmpCluster;
+ AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];
std::ifstream fin;
- fin.open("events/event.0.dump");
- tmpCluster.ReadEvent(fin);
- fin.close();
- AliHLTTPCCASliceOutput *tmpOutput = NULL;
AliHLTTPCCAParam tmpParam;
AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;
- fSlaveTrackers[0].SetOutputControl(&tmpOutputControl);
- tmpParam.SetNRows(HLTCA_ROW_COUNT);
- fSlaveTrackers[0].SetParam(tmpParam);
- Reconstruct(&tmpOutput, &tmpCluster, 0, 1);
- free(tmpOutput);
- tmpOutput = NULL;
- fSlaveTrackers[0].SetOutputControl(NULL);
+
+ fin.open("events/settings.dump");
+ int tmpCount;
+ fin >> tmpCount;
+ for (int i = 0;i < sliceCount;i++)
+ {
+ fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);
+ tmpParam.ReadSettings(fin);
+ InitializeSliceParam(i, tmpParam);
+ }
+ fin.close();
+
+ fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);
+ for (int i = 0;i < sliceCount;i++)
+ {
+ tmpCluster[i].StartReading(i, 0);
+ tmpCluster[i].ReadEvent(fin);
+ tmpCluster[i].FinishReading();
+ }
+ fin.close();
+
+ AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];
+ memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));
+
+ Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);
+ for (int i = 0;i < sliceCount;i++)
+ {
+ free(tmpOutput[i]);
+ tmpOutput[i] = NULL;
+ fSlaveTrackers[i].SetOutputControl(NULL);
+ }
+ delete[] tmpOutput;
+ delete[] tmpCluster;
fDebugLevel = useDebugLevel;
}
#endif
- fgGPUUsed = true;
return(0);
}
-template <class T> inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int alignment)
+template <class T> inline T* AliHLTTPCCAGPUTrackerNVCC::alignPointer(T* ptr, int alignment)
{
//Macro to align Pointers.
//Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker
return((T*) adr);
}
-bool AliHLTTPCCAGPUTracker::CudaFailedMsg(cudaError_t error)
+bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsg(cudaError_t error)
{
//Check for CUDA Error and in the case of an error display the corresponding error string
if (error == cudaSuccess) return(false);
return(true);
}
-int AliHLTTPCCAGPUTracker::CUDASync(char* state)
+int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state)
{
//Wait for CUDA-Kernel to finish and check for CUDA errors afterwards
return(0);
}
-void AliHLTTPCCAGPUTracker::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
+void AliHLTTPCCAGPUTrackerNVCC::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
{
	//Set Debug Level and Debug output File if applicable
	// dwLevel: verbosity threshold consulted throughout the tracker
	// (e.g. >=2 prints the CUDA device listing, >=3 traces processing steps,
	// >=4 writes dumps to fOutFile — see InitGPU/Reconstruct).
	// NewOutFile: optional debug dump stream; it is adopted only when
	// non-NULL, so an already-configured fOutFile is never cleared.
	fDebugLevel = dwLevel;
	if (NewOutFile) fOutFile = NewOutFile;
}
-int AliHLTTPCCAGPUTracker::SetGPUTrackerOption(char* OptionName, int /*OptionValue*/)
+int AliHLTTPCCAGPUTrackerNVCC::SetGPUTrackerOption(char* OptionName, int /*OptionValue*/)
{
//Set a specific GPU Tracker Option
{
}
#ifdef HLTCA_STANDALONE
-void AliHLTTPCCAGPUTracker::StandalonePerfTime(int iSlice, int i)
+void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int iSlice, int i)
{
//Run Performance Query for timer i of slice iSlice
if (fDebugLevel >= 1)
}
}
#else
-void AliHLTTPCCAGPUTracker::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
// No-op stub: per-step performance timers are only collected in the
// HLTCA_STANDALONE build (see the #ifdef above); in the online build the
// call sites compile to nothing.
+void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
#endif
-void AliHLTTPCCAGPUTracker::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
+void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
{
//Dump Rowblocks to File
if (fDebugLevel >= 4)
for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
{
*fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " <<
- rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << endl << "Phase 1: ";
+ rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";
for (int j = 0;j < rowBlockPos[i].x;j++)
{
//Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
HLTError("Error, -1 Tracklet found");
}
}
- *fOutFile << endl << "Phase 2: ";
+ *fOutFile << std::endl << "Phase 2: ";
for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
{
*fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
}
- *fOutFile << endl;
+ *fOutFile << std::endl;
}
if (check)
sliceDataHitWeights4[i] = i0;
}
-int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
+int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
{
//Primary reconstruction function
cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
- if (sliceCountLocal == -1) sliceCountLocal = this->fSliceCount;
+ if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;
+
+ if (!fCudaInitialized)
+ {
+ HLTError("GPUTracker not initialized");
+ return(1);
+ }
+ if (sliceCountLocal > fSliceCount)
+ {
+ HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);
+ return(1);
+ }
+ if (fThreadId != GetThread())
+ {
+ HLTError("GPUTracker context was initialized by different thread, Initializing Thread: %d, Processing Thread: %d", fThreadId, GetThread());
+ return(1);
+ }
+
+ if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);
if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
{
{
for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
{
- *fOutFile << endl << endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << endl;
+ *fOutFile << std::endl << std::endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << std::endl;
}
}
memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
- if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice + sliceCountLocal].Param().ISlice());
if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
#ifdef HLTCA_GPU_TIME_PROFILE
- __int64 a, b, c, d;
- QueryPerformanceFrequency((LARGE_INTEGER*) &c);
- QueryPerformanceCounter((LARGE_INTEGER*) &d);
+ unsigned __int64 a, b, c, d;
+ AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);
+ AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);
#endif
for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
//Make this a GPU Tracker
fGpuTracker[iSlice].SetGPUTracker();
fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
- fGpuTracker[iSlice].pData()->SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
- fGpuTracker[iSlice].pData()->SetPointers(&pClusterData[iSlice], false);
+ fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
+ fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);
//Set Pointers to GPU Memory
char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
- fGpuTracker[iSlice].pData()->SetGPUTextureBase(fGpuTracker[0].Data().Memory());
+ fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());
}
#ifdef HLTCA_GPU_TEXTURE_FETCH
size_t offset;
if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
{
- HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
+ HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
return(1);
}
cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
{
- HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
+ HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
return(1);
}
cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
{
- HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
+ HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
return(1);
}
#endif
//Initialize GPU Slave Tracker
if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");
- fSlaveTrackers[firstSlice + iSlice].pData()->SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
+ fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
#ifdef HLTCA_GPU_TIME_PROFILE
- QueryPerformanceCounter((LARGE_INTEGER*) &a);
+ AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
#endif
fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
+
+ if (fDebugLevel >= 4)
+ {
+ *fOutFile << std::endl << std::endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << std::endl;
+ *fOutFile << "Slice Data:" << std::endl;
+ fSlaveTrackers[firstSlice + iSlice].DumpSliceData(*fOutFile);
+ }
+
#ifdef HLTCA_GPU_TIME_PROFILE
- QueryPerformanceCounter((LARGE_INTEGER*) &b);
+ AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
#endif
if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
if (fDebugLevel >= 4)
{
if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
- fSlaveTrackers[firstSlice + iSlice].TrackletMemory() = reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] );
- fSlaveTrackers[firstSlice + iSlice].SetPointersTracklets( HLTCA_GPU_MAX_TRACKLETS );
- fSlaveTrackers[firstSlice + iSlice].HitMemory() = reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100] );
- fSlaveTrackers[firstSlice + iSlice].SetPointersHits( pClusterData[iSlice].NumberOfClusters() );
+ fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS);
+ fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() );
}
if (CUDASync("Initialization")) return(1);
}
#endif
+ int nHardCollisions = 0;
+
+RestartTrackletConstructor:
if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
{
}
}
- for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT)
+ int runSlices = 0;
+ for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)
{
- if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT);
- AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT, sliceCountLocal - iSlice));
+ if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;
+ if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);
+ AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));
}
if (CUDASync("Tracklet Selector")) return 1;
StandalonePerfTime(firstSlice, 9);
if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
{
+ if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION && nHardCollisions++ < 10)
+ {
+ HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor");
+ for (int i = 0;i < sliceCountLocal;i++)
+ {
+ cudaThreadSynchronize();
+ CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));
+ *fSlaveTrackers[firstSlice + i].NTracks() = 0;
+ *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;
+ fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;
+ CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));
+ PreInitRowBlocks<<<30, 256>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());
+ }
+ goto RestartTrackletConstructor;
+ }
HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
return(1);
}
fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
#ifdef HLTCA_GPU_TIME_PROFILE
- QueryPerformanceCounter((LARGE_INTEGER*) &a);
+ AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);
#endif
fSlaveTrackers[firstSlice + iSlice].WriteOutput();
#ifdef HLTCA_GPU_TIME_PROFILE
- QueryPerformanceCounter((LARGE_INTEGER*) &b);
+ AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);
printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);
#endif
return(0);
}
-int AliHLTTPCCAGPUTracker::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
+int AliHLTTPCCAGPUTrackerNVCC::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
{
	//Initialize Slice Tracker Parameter for a slave tracker.
	//The text here previously contained the mojibake sequence "¶m"
	//(an HTML-escaped "&para;" swallowing "&param"); restored to the
	//reference parameter "AliHLTTPCCAParam &param" so the signature parses.
	//Forwards the parameter set to the host-side tracker for slice iSlice;
	//always returns 0 (success).
	fSlaveTrackers[iSlice].Initialize(param);
	return(0);
}
-int AliHLTTPCCAGPUTracker::ExitGPU()
+int AliHLTTPCCAGPUTrackerNVCC::ExitGPU()
{
//Uninitialize CUDA
cudaThreadSynchronize();
}
HLTInfo("CUDA Uninitialized");
fgGPUUsed = false;
+ fCudaInitialized = 0;
return(0);
}
-void AliHLTTPCCAGPUTracker::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
+void AliHLTTPCCAGPUTrackerNVCC::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)
{
+	//Set Output Control Pointers
	// Stores the output-control structure and iterates over all slices.
	// NOTE(review): the per-slice loop body is not visible in this excerpt —
	// presumably it forwards val to each fSlaveTrackers[i]; confirm against
	// the full file before relying on that behavior.
	fOutputControl = val;
	for (int i = 0;i < fgkNSlices;i++)
	{
	}
}
-#endif
\ No newline at end of file
+int AliHLTTPCCAGPUTrackerNVCC::GetThread()
+{
+	//Get Thread ID.
+	//Used to verify that Reconstruct() runs on the same thread that created
+	//the CUDA context in InitGPU() (fThreadId comparison).
+#ifdef R__WIN32
+	//GetCurrentThread() returns only a pseudo-handle (the constant -2),
+	//identical in every thread, which would make the thread-identity check
+	//useless; GetCurrentThreadId() returns the real, unique thread id.
+	return((int) GetCurrentThreadId());
+#else
+	return((int) syscall (SYS_gettid));
+#endif
+}
+
+unsigned long long int* AliHLTTPCCAGPUTrackerNVCC::PerfTimer(int iSlice, unsigned int i)
+{
+	//Returns pointer to PerfTimer i of slice iSlice
+	// NOTE(review): if fSlaveTrackers is a fixed-size member array (as its
+	// indexed use elsewhere in this file suggests) the NULL test is always
+	// true and therefore redundant — harmless, but confirm against the
+	// class declaration.
+	return(fSlaveTrackers ? fSlaveTrackers[iSlice].PerfTimer(i) : NULL);
+}
+
+const AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerNVCC::OutputControl() const
+{
+	//Return Pointer to Output Control Structure
+	// Read-only accessor for the pointer stored by SetOutputControl();
+	// NULL until SetOutputControl() has been called (see constructor).
+	return fOutputControl;
+}
+
+int AliHLTTPCCAGPUTrackerNVCC::GetSliceCount() const
+{
+	//Return max slice count processable
+	// fSliceCount is 0 until InitGPU() succeeds, after which it holds the
+	// sliceCount the GPU memory layout was sized for.
+	return(fSliceCount);
+}
+
// Free-function factory returning the concrete tracker through its abstract
// base-class pointer. NOTE(review): presumably exported so the framework can
// instantiate the CUDA tracker via dynamic library loading without a
// compile-time CUDA dependency — confirm against the library build files.
+AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()
+{
+	return new AliHLTTPCCAGPUTrackerNVCC;
+}
// Matching factory destructor for AliHLTTPCCAGPUTrackerNVCCCreate().
// NOTE(review): deletes through the base-class pointer, which requires
// AliHLTTPCCAGPUTracker to declare a virtual destructor (declaration not
// visible in this excerpt — verify, otherwise this is undefined behavior).
+void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr)
+{
+	delete ptr;
+}
+