GPU update from David Rohr
authorsgorbuno <sgorbuno@f7af4fe6-9843-0410-8265-dc069ae4e863>
Wed, 28 Oct 2009 18:05:14 +0000 (18:05 +0000)
committersgorbuno <sgorbuno@f7af4fe6-9843-0410-8265-dc069ae4e863>
Wed, 28 Oct 2009 18:05:14 +0000 (18:05 +0000)
HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.h
HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTrackerNVCC.cu
HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerComponent.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletConstructor.cxx

index 7a88954..0bedc74 100644 (file)
@@ -27,7 +27,9 @@ public:
          fGPUMemSize(0),
          fpCudaStreams(NULL),
          fSliceCount(0),
-         fOutputControl(NULL)
+         fOutputControl(NULL),
+         fThreadId(0),
+         fCudaInitialized(0)
          {};
          ~AliHLTTPCCAGPUTracker() {};
 
@@ -83,6 +85,8 @@ private:
        AliHLTTPCCASliceOutput::outputControlStruct* fOutputControl;
        
        static bool fgGPUUsed;
+       int fThreadId;
+       int fCudaInitialized;
 
        // disable copy
        AliHLTTPCCAGPUTracker( const AliHLTTPCCAGPUTracker& );
index c9cb853..97114bf 100644 (file)
@@ -21,6 +21,9 @@
 
 #ifdef BUILD_GPU
 
+#include <cuda.h>
+#include <sys/syscall.h>
+
 #include "AliHLTTPCCADef.h"
 #include "AliHLTTPCCAGPUConfig.h"
 
@@ -82,32 +85,49 @@ int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
            HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in same process");
            return(1);
        }
+       fgGPUUsed = 1;
+       fThreadId = (int) syscall (SYS_gettid);
 
        cudaDeviceProp fCudaDeviceProp;
 
+       fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
+
 #ifndef CUDA_DEVICE_EMULATION
-       int count, bestDevice = -1, bestDeviceSpeed = 0;
+       int count, bestDevice = -1;
+       long long int bestDeviceSpeed = 0, deviceSpeed;
        if (CudaFailedMsg(cudaGetDeviceCount(&count)))
        {
                HLTError("Error getting CUDA Device Count");
+               fgGPUUsed = 0;
                return(1);
        }
-       if (fDebugLevel >= 2) std::cout << "Available CUDA devices: ";
+       if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");
        for (int i = 0;i < count;i++)
        {
-               cudaGetDeviceProperties(&fCudaDeviceProp, i);
-               if (fDebugLevel >= 2) std::cout << fCudaDeviceProp.name << " (" << i << ")     ";
-               if (fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && fCudaDeviceProp.multiProcessorCount * fCudaDeviceProp.clockRate > bestDeviceSpeed)
+               unsigned int free, total;
+               cuInit(0);
+               CUdevice tmpDevice;
+               cuDeviceGet(&tmpDevice, i);
+               CUcontext tmpContext;
+               cuCtxCreate(&tmpContext, 0, tmpDevice);
+               if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";
+               cuCtxDestroy(tmpContext);
+               CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i));
+
+               int deviceOK = fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && free >= fGPUMemSize;
+
+               if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %d / %d)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, free, fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");
+               deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free;
+               if (deviceOK && deviceSpeed > bestDeviceSpeed)
                {
                        bestDevice = i;
-                       bestDeviceSpeed = fCudaDeviceProp.multiProcessorCount * fCudaDeviceProp.clockRate;
+                       bestDeviceSpeed = deviceSpeed;
                }
        }
-       if (fDebugLevel >= 2) std::cout << std::endl;
-
        if (bestDevice == -1)
        {
                HLTWarning("No CUDA Device available, aborting CUDA Initialisation");
+               fgGPUUsed = 0;
                return(1);
        }
 
@@ -124,32 +144,40 @@ int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
 
   if (fDebugLevel >= 1)
   {
-         std::cout<<"CUDA Device Properties: "<<std::endl;
-         std::cout<<"name = "<<fCudaDeviceProp.name<<std::endl;
-         std::cout<<"totalGlobalMem = "<<fCudaDeviceProp.totalGlobalMem<<std::endl;
-         std::cout<<"sharedMemPerBlock = "<<fCudaDeviceProp.sharedMemPerBlock<<std::endl;
-         std::cout<<"regsPerBlock = "<<fCudaDeviceProp.regsPerBlock<<std::endl;
-         std::cout<<"warpSize = "<<fCudaDeviceProp.warpSize<<std::endl;
-         std::cout<<"memPitch = "<<fCudaDeviceProp.memPitch<<std::endl;
-         std::cout<<"maxThreadsPerBlock = "<<fCudaDeviceProp.maxThreadsPerBlock<<std::endl;
-         std::cout<<"maxThreadsDim = "<<fCudaDeviceProp.maxThreadsDim[0]<<" "<<fCudaDeviceProp.maxThreadsDim[1]<<" "<<fCudaDeviceProp.maxThreadsDim[2]<<std::endl;
-         std::cout<<"maxGridSize = "  <<fCudaDeviceProp.maxGridSize[0]<<" "<<fCudaDeviceProp.maxGridSize[1]<<" "<<fCudaDeviceProp.maxGridSize[2]<<std::endl;
-         std::cout<<"totalConstMem = "<<fCudaDeviceProp.totalConstMem<<std::endl;
-         std::cout<<"major = "<<fCudaDeviceProp.major<<std::endl;
-         std::cout<<"minor = "<<fCudaDeviceProp.minor<<std::endl;
-         std::cout<<"clockRate = "<<fCudaDeviceProp.clockRate<<std::endl;
-         std::cout<<"textureAlignment = "<<fCudaDeviceProp.textureAlignment<<std::endl;
+         HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);
+         HLTInfo("totalGlobalMem = %d", fCudaDeviceProp.totalGlobalMem);
+         HLTInfo("sharedMemPerBlock = %d", fCudaDeviceProp.sharedMemPerBlock);
+         HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);
+         HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);
+         HLTInfo("memPitch = %d", fCudaDeviceProp.memPitch);
+         HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);
+         HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);
+         HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);
+         HLTInfo("totalConstMem = %d", fCudaDeviceProp.totalConstMem);
+         HLTInfo("major = %d", fCudaDeviceProp.major);
+         HLTInfo("minor = %d", fCudaDeviceProp.minor);
+         HLTInfo("clockRate %d= ", fCudaDeviceProp.clockRate);
+         HLTInfo("textureAlignment %d= ", fCudaDeviceProp.textureAlignment);
   }
 
   if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
   {
        HLTError( "Unsupported CUDA Device" );
-         return(1);
+       fgGPUUsed = 0;
+       return(1);
   }
 
+  if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
+  {
+         HLTError("Insufficiant Tracker Object Memory");
+         fgGPUUsed = 0;
+         return(1);
+  }
+  
   if (CudaFailedMsg(cudaSetDevice(cudaDevice)))
   {
          HLTError("Could not set CUDA Device!");
+         fgGPUUsed = 0;
          return(1);
   }
 
@@ -157,6 +185,7 @@ int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
   {
          HLTError("Insufficiant Common Memory");
          cudaThreadExit();
+         fgGPUUsed = 0;
          return(1);
   }
 
@@ -164,14 +193,15 @@ int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
   {
          HLTError("Insufficiant Row Memory");
          cudaThreadExit();
+         fgGPUUsed = 0;
          return(1);
   }
 
-  fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
   if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
   {
          HLTError("CUDA Memory Allocation Error");
          cudaThreadExit();
+         fgGPUUsed = 0;
          return(1);
   }
   if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
@@ -181,6 +211,7 @@ int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
          cudaFree(fGPUMemory);
          cudaThreadExit();
          HLTError("Error allocating Page Locked Host Memory");
+         fgGPUUsed = 0;
          return(1);
   }
   if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);
@@ -189,15 +220,9 @@ int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
   {
          CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
   }
-  HLTInfo("CUDA Initialisation successfull");
 
-  //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
-  if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
-  {
-         HLTError("Insufficiant Tracker Object Memory");
-         return(1);
-  }
   fSliceCount = sliceCount;
+  //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
   fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);
 
   for (int i = 0;i < fgkNSlices;i++)
@@ -213,11 +238,18 @@ int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
   {
        if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
        {
-               HLTError("Error creating CUDA Stream");
-               return(1);
+           cudaFree(fGPUMemory);
+           cudaFreeHost(fHostLockedMemory);
+           cudaThreadExit();
+           HLTError("Error creating CUDA Stream");
+           fgGPUUsed = 0;
+           return(1);
        }
   }
 
+  fCudaInitialized = 1;
+  HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %dd)", cudaDevice, fCudaDeviceProp.name, fThreadId);
+
 #if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)
   if (fDebugLevel < 2)
   {
@@ -266,7 +298,6 @@ int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
          fDebugLevel = useDebugLevel;
   }
 #endif
-    fgGPUUsed = true;
   return(0);
 }
 
@@ -423,7 +454,25 @@ int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTT
        //Primary reconstruction function
        cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
 
-       if (sliceCountLocal == -1) sliceCountLocal = this->fSliceCount;
+       if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;
+       
+       if (!fCudaInitialized)
+       {
+           HLTError("GPUTracker not initialized");
+           return(1);
+       }
+       if (sliceCountLocal > fSliceCount)
+       {
+           HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);
+           return(1);
+       }
+       if (fThreadId != (int) syscall (SYS_gettid))
+       {
+           HLTError("GPUTracker context was initialized by different thread, Initializing Thread: %d, Processing Thread: %d", fThreadId, (int) syscall (SYS_gettid));
+           return(1);
+       }
+
+       if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);
 
        if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
        {
@@ -441,7 +490,6 @@ int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTT
 
        memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);
 
-       if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice + sliceCountLocal].Param().ISlice());
        if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");
 
 #ifdef HLTCA_GPU_TIME_PROFILE
@@ -502,19 +550,19 @@ int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTT
                size_t offset;
                if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
                {
-                       HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
+                       HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);
                        return(1);
                }
                cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
                if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
                {
-                       HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
+                       HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);
                        return(1);
                }
                cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
                if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)
                {
-                       HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
+                       HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);
                        return(1);
                }
 #endif
@@ -668,6 +716,9 @@ int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTT
        }
 #endif
 
+       int nHardCollisions = 0;
+
+RestartTrackletConstructor:
        if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");
        for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
        {
@@ -736,6 +787,21 @@ int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTT
 
                        if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
                        {
+                               if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION && nHardCollisions++ < 10)
+                               {
+                                       HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor");
+                                       for (int i = 0;i < sliceCountLocal;i++)
+                                       {
+                                               cudaThreadSynchronize();
+                                               CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));
+                                               *fSlaveTrackers[firstSlice + i].NTracks() = 0;
+                                               *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;
+                                               fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;
+                                               CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));
+                                               PreInitRowBlocks<<<30, 256>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());
+                                       }
+                                       goto RestartTrackletConstructor;
+                               }
                                HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
                                return(1);
                        }
@@ -863,6 +929,7 @@ int AliHLTTPCCAGPUTracker::ExitGPU()
        }
        HLTInfo("CUDA Uninitialized");
        fgGPUUsed = false;
+       fCudaInitialized = 0;
        return(0);
 }
 
index bcfaa50..799e32a 100644 (file)
@@ -109,7 +109,7 @@ AliHLTTPCCATrackerComponent& AliHLTTPCCATrackerComponent::operator=( const AliHL
 AliHLTTPCCATrackerComponent::~AliHLTTPCCATrackerComponent()
 {
   // see header file for class documentation
-  delete fTracker;
+  if (fTracker) delete fTracker;
 }
 
 //
@@ -327,7 +327,8 @@ int AliHLTTPCCATrackerComponent::DoInit( int argc, const char** argv )
   if ( fTracker ) return EINPROGRESS;
 
 
-  fTracker = new AliHLTTPCCATrackerFramework();
+  //fTracker = new AliHLTTPCCATrackerFramework();
+  //Do not initialize the TrackerFramework here since the CUDA framework is thread local and DoInit is called from different thread than DoEvent
 
   TString arguments = "";
   for ( int i = 0; i < argc; i++ ) {
@@ -342,7 +343,7 @@ int AliHLTTPCCATrackerComponent::DoInit( int argc, const char** argv )
 int AliHLTTPCCATrackerComponent::DoDeinit()
 {
   // see header file for class documentation
-  delete fTracker;
+  if (fTracker) delete fTracker;
   fTracker = NULL;
   return 0;
 }
@@ -492,6 +493,7 @@ int AliHLTTPCCATrackerComponent::DoEvent
   }
 
   if ( !fTracker ) fTracker = new AliHLTTPCCATrackerFramework;
+
   int slicecount = maxslice + 1 - minslice;
   if (slicecount > fTracker->MaxSliceCount())
   {
index b911b2c..65be031 100644 (file)
@@ -881,10 +881,10 @@ GPUd() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tra
                int nTryCount = 0;
                while ((nTracklet = *ptrTracklet) == -1)
                {
-                       for (int i = 0;i < 10000;i++)
+                       for (int i = 0;i < 20000;i++)
                                sMem.fNextTrackletStupidDummy++;
                        nTryCount++;
-                       if (nTryCount > 20)
+                       if (nTryCount > 30)
                        {
                                tracker.GPUParameters()->fGPUError = HLTCA_GPU_ERROR_SCHEDULE_COLLISION;
                                return(-1);