+++ /dev/null
-// **************************************************************************\r
-// This file is property of and copyright by the ALICE HLT Project *\r
-// ALICE Experiment at CERN, All rights reserved. *\r
-// *\r
-// Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *\r
-// Ivan Kisel <kisel@kip.uni-heidelberg.de> *\r
-// David Rohr <drohr@kip.uni-heidelberg.de> *\r
-// for The ALICE HLT Project. *\r
-// *\r
-// Permission to use, copy, modify and distribute this software and its *\r
-// documentation strictly for non-commercial purposes is hereby granted *\r
-// without fee, provided that the above copyright notice appears in all *\r
-// copies and that both the copyright notice and this permission notice *\r
-// appear in the supporting documentation. The authors make no claims *\r
-// about the suitability of this software for any purpose. It is *\r
-// provided "as is" without express or implied warranty. *\r
-// *\r
-//***************************************************************************\r
-\r
-#include "AliHLTTPCCAGPUTrackerNVCC.h"\r
-\r
-#ifdef HLTCA_GPUCODE\r
-#include <cuda.h>\r
-#include <sm_11_atomic_functions.h>\r
-#include <sm_12_atomic_functions.h>\r
-#endif\r
-\r
-#ifdef R__WIN32\r
-#else\r
-#include <sys/syscall.h>\r
-#include <semaphore.h>\r
-#include <fcntl.h>\r
-#endif\r
-\r
-#include "AliHLTTPCCADef.h"\r
-#include "AliHLTTPCCAGPUConfig.h"\r
-\r
-\r
-#include <iostream>\r
-\r
-//Disable assertions since they produce errors in GPU Code\r
-#ifdef assert\r
-#undef assert\r
-#endif\r
-#define assert(param)\r
-\r
-__constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];\r
-#ifdef HLTCA_GPU_TEXTURE_FETCH\r
-texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;\r
-texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;\r
-texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;\r
-#endif\r
-\r
-//Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code\r
-#include "AliHLTTPCCATrackParam.cxx"\r
-#include "AliHLTTPCCATrack.cxx" \r
-\r
-#include "AliHLTTPCCAHitArea.cxx"\r
-#include "AliHLTTPCCAGrid.cxx"\r
-#include "AliHLTTPCCARow.cxx"\r
-#include "AliHLTTPCCAParam.cxx"\r
-#include "AliHLTTPCCATracker.cxx"\r
-\r
-#include "AliHLTTPCCAProcess.h"\r
-\r
-#include "AliHLTTPCCATrackletSelector.cxx"\r
-#include "AliHLTTPCCANeighboursFinder.cxx"\r
-#include "AliHLTTPCCANeighboursCleaner.cxx"\r
-#include "AliHLTTPCCAStartHitsFinder.cxx"\r
-#include "AliHLTTPCCAStartHitsSorter.cxx"\r
-#include "AliHLTTPCCATrackletConstructor.cxx"\r
-\r
-#include "MemoryAssignmentHelpers.h"\r
-\r
-#ifndef HLTCA_STANDALONE\r
-#include "AliHLTDefinitions.h"\r
-#include "AliHLTSystem.h"\r
-#endif\r
-\r
-ClassImp( AliHLTTPCCAGPUTrackerNVCC )\r
-\r
-bool AliHLTTPCCAGPUTrackerNVCC::fgGPUUsed = false;\r
-\r
-#define SemLockName "AliceHLTTPCCAGPUTrackerInitLockSem"\r
-\r
-AliHLTTPCCAGPUTrackerNVCC::AliHLTTPCCAGPUTrackerNVCC() :\r
- fGpuTracker(NULL),\r
- fGPUMemory(NULL),\r
- fHostLockedMemory(NULL),\r
- fDebugLevel(0),\r
- fDebugMask(0xFFFFFFFF),\r
- fOutFile(NULL),\r
- fGPUMemSize(0),\r
- fpCudaStreams(NULL),\r
- fSliceCount(0),\r
- fOutputControl(NULL),\r
- fThreadId(0),\r
- fCudaInitialized(0),\r
- fPPMode(0)\r
- {};\r
-\r
-AliHLTTPCCAGPUTrackerNVCC::~AliHLTTPCCAGPUTrackerNVCC() {};\r
-\r
-void AliHLTTPCCAGPUTrackerNVCC::ReleaseGlobalLock(void* sem)\r
-{\r
- //Release the global named semaphore that locks GPU Initialization\r
-#ifdef R__WIN32\r
- HANDLE* h = (HANDLE*) sem;\r
- ReleaseSemaphore(*h, 1, NULL);\r
- CloseHandle(*h);\r
- delete h;\r
-#else\r
- sem_t* pSem = (sem_t*) sem;\r
- sem_post(pSem);\r
- sem_unlink(SemLockName);\r
-#endif\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::CheckMemorySizes(int sliceCount)\r
-{\r
- //Check constants for correct memory sizes\r
- if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)\r
- {\r
- HLTError("Insufficiant Tracker Object Memory");\r
- return(1);\r
- }\r
-\r
- if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)\r
- {\r
- HLTError("Insufficiant Common Memory");\r
- return(1);\r
- }\r
-\r
- if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)\r
- {\r
- HLTError("Insufficiant Row Memory");\r
- return(1);\r
- }\r
-\r
- if (fDebugLevel >= 3)\r
- {\r
- HLTInfo("Memory usage: Tracker Object %d / %d, Common Memory %d / %d, Row Memory %d / %d", sizeof(AliHLTTPCCATracker) * sliceCount, HLTCA_GPU_TRACKER_OBJECT_MEMORY, fgkNSlices * AliHLTTPCCATracker::CommonMemorySize(), HLTCA_GPU_COMMON_MEMORY, fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), HLTCA_GPU_ROWS_MEMORY);\r
- }\r
- return(0);\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::InitGPU(int sliceCount, int forceDeviceID)\r
-{\r
- //Find best CUDA device, initialize and allocate memory\r
-\r
- if (CheckMemorySizes(sliceCount)) return(1);\r
-\r
-#ifdef R__WIN32\r
- HANDLE* semLock = new HANDLE;\r
- *semLock = CreateSemaphore(NULL, 1, 1, SemLockName);\r
- if (*semLock == NULL)\r
- {\r
- HLTError("Error creating GPUInit Semaphore");\r
- return(1);\r
- }\r
- WaitForSingleObject(*semLock, INFINITE);\r
-#else\r
- sem_t* semLock = sem_open(SemLockName, O_CREAT, 0x01B6, 1);\r
- if (semLock == SEM_FAILED)\r
- {\r
- HLTError("Error creating GPUInit Semaphore");\r
- return(1);\r
- }\r
- sem_wait(semLock);\r
-#endif\r
-\r
- if (fgGPUUsed)\r
- {\r
- HLTWarning("CUDA already used by another AliHLTTPCCAGPUTracker running in same process");\r
- ReleaseGlobalLock(semLock);\r
- return(1);\r
- }\r
- fgGPUUsed = 1;\r
- fThreadId = GetThread();\r
-\r
- cudaDeviceProp fCudaDeviceProp;\r
-\r
- fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);\r
-\r
-#ifndef CUDA_DEVICE_EMULATION\r
- int count, bestDevice = -1;\r
- long long int bestDeviceSpeed = 0, deviceSpeed;\r
- if (CudaFailedMsg(cudaGetDeviceCount(&count)))\r
- {\r
- HLTError("Error getting CUDA Device Count");\r
- fgGPUUsed = 0;\r
- ReleaseGlobalLock(semLock);\r
- return(1);\r
- }\r
- if (fDebugLevel >= 2) HLTInfo("Available CUDA devices:");\r
- for (int i = 0;i < count;i++)\r
- {\r
- unsigned int free, total;\r
- cuInit(0);\r
- CUdevice tmpDevice;\r
- cuDeviceGet(&tmpDevice, i);\r
- CUcontext tmpContext;\r
- cuCtxCreate(&tmpContext, 0, tmpDevice);\r
- if(cuMemGetInfo(&free, &total)) std::cout << "Error\n";\r
- cuCtxDestroy(tmpContext);\r
- if (CudaFailedMsg(cudaGetDeviceProperties(&fCudaDeviceProp, i))) continue;\r
-\r
- int deviceOK = sliceCount <= fCudaDeviceProp.multiProcessorCount && fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && free >= fGPUMemSize;\r
-\r
- if (fDebugLevel >= 2) HLTInfo("%s%2d: %s (Rev: %d.%d - Mem Avail %d / %lld)%s", deviceOK ? " " : "[", i, fCudaDeviceProp.name, fCudaDeviceProp.major, fCudaDeviceProp.minor, free, (long long int) fCudaDeviceProp.totalGlobalMem, deviceOK ? "" : " ]");\r
- deviceSpeed = (long long int) fCudaDeviceProp.multiProcessorCount * (long long int) fCudaDeviceProp.clockRate * (long long int) fCudaDeviceProp.warpSize * (long long int) free;\r
- if (deviceOK && deviceSpeed > bestDeviceSpeed)\r
- {\r
- bestDevice = i;\r
- bestDeviceSpeed = deviceSpeed;\r
- }\r
- }\r
- if (bestDevice == -1)\r
- {\r
- HLTWarning("No CUDA Device available, aborting CUDA Initialisation");\r
- HLTInfo("Requiring Revision 1.3, Mem: %d", fGPUMemSize);\r
- fgGPUUsed = 0;\r
- ReleaseGlobalLock(semLock);\r
- return(1);\r
- }\r
-\r
- int cudaDevice;\r
- if (forceDeviceID == -1)\r
- cudaDevice = bestDevice;\r
- else\r
- cudaDevice = forceDeviceID;\r
-#else\r
- int cudaDevice = 0;\r
-#endif\r
-\r
- cudaGetDeviceProperties(&fCudaDeviceProp ,cudaDevice ); \r
-\r
- if (fDebugLevel >= 1)\r
- {\r
- HLTInfo("Using CUDA Device %s with Properties:", fCudaDeviceProp.name);\r
- HLTInfo("totalGlobalMem = %lld", (unsigned long long int) fCudaDeviceProp.totalGlobalMem);\r
- HLTInfo("sharedMemPerBlock = %lld", (unsigned long long int) fCudaDeviceProp.sharedMemPerBlock);\r
- HLTInfo("regsPerBlock = %d", fCudaDeviceProp.regsPerBlock);\r
- HLTInfo("warpSize = %d", fCudaDeviceProp.warpSize);\r
- HLTInfo("memPitch = %lld", (unsigned long long int) fCudaDeviceProp.memPitch);\r
- HLTInfo("maxThreadsPerBlock = %d", fCudaDeviceProp.maxThreadsPerBlock);\r
- HLTInfo("maxThreadsDim = %d %d %d", fCudaDeviceProp.maxThreadsDim[0], fCudaDeviceProp.maxThreadsDim[1], fCudaDeviceProp.maxThreadsDim[2]);\r
- HLTInfo("maxGridSize = %d %d %d", fCudaDeviceProp.maxGridSize[0], fCudaDeviceProp.maxGridSize[1], fCudaDeviceProp.maxGridSize[2]);\r
- HLTInfo("totalConstMem = %lld", (unsigned long long int) fCudaDeviceProp.totalConstMem);\r
- HLTInfo("major = %d", fCudaDeviceProp.major);\r
- HLTInfo("minor = %d", fCudaDeviceProp.minor);\r
- HLTInfo("clockRate %d= ", fCudaDeviceProp.clockRate);\r
- HLTInfo("textureAlignment %lld= ", (unsigned long long int) fCudaDeviceProp.textureAlignment);\r
- }\r
-\r
- if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))\r
- {\r
- HLTError( "Unsupported CUDA Device" );\r
- fgGPUUsed = 0;\r
- ReleaseGlobalLock(semLock);\r
- return(1);\r
- }\r
-\r
- if (CudaFailedMsg(cudaSetDevice(cudaDevice)))\r
- {\r
- HLTError("Could not set CUDA Device!");\r
- fgGPUUsed = 0;\r
- ReleaseGlobalLock(semLock);\r
- return(1);\r
- }\r
-\r
- if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))\r
- {\r
- HLTError("CUDA Memory Allocation Error");\r
- cudaThreadExit();\r
- fgGPUUsed = 0;\r
- ReleaseGlobalLock(semLock);\r
- return(1);\r
- }\r
- ReleaseGlobalLock(semLock);\r
- if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);\r
- int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;\r
- if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))\r
- {\r
- cudaFree(fGPUMemory);\r
- cudaThreadExit();\r
- HLTError("Error allocating Page Locked Host Memory");\r
- fgGPUUsed = 0;\r
- return(1);\r
- }\r
- if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);\r
-\r
- if (fDebugLevel >= 1)\r
- {\r
- CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));\r
- }\r
-\r
- fSliceCount = sliceCount;\r
- //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory\r
- fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);\r
-\r
- for (int i = 0;i < fgkNSlices;i++)\r
- {\r
- fSlaveTrackers[i].SetGPUTracker();\r
- fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));\r
- fSlaveTrackers[i].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));\r
- }\r
-\r
- fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));\r
- cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;\r
- for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)\r
- {\r
- if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))\r
- {\r
- cudaFree(fGPUMemory);\r
- cudaFreeHost(fHostLockedMemory);\r
- cudaThreadExit();\r
- HLTError("Error creating CUDA Stream");\r
- fgGPUUsed = 0;\r
- return(1);\r
- }\r
- }\r
-\r
- fCudaInitialized = 1;\r
- HLTImportant("CUDA Initialisation successfull (Device %d: %s, Thread %d, Max slices: %d)", cudaDevice, fCudaDeviceProp.name, fThreadId, fSliceCount);\r
-\r
-#if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)\r
- if (fDebugLevel < 2)\r
- {\r
- //Do one initial run for Benchmark reasons\r
- const int useDebugLevel = fDebugLevel;\r
- fDebugLevel = 0;\r
- AliHLTTPCCAClusterData* tmpCluster = new AliHLTTPCCAClusterData[sliceCount];\r
-\r
- std::ifstream fin;\r
-\r
- AliHLTTPCCAParam tmpParam;\r
- AliHLTTPCCASliceOutput::outputControlStruct tmpOutputControl;\r
-\r
- fin.open("events/settings.dump");\r
- int tmpCount;\r
- fin >> tmpCount;\r
- for (int i = 0;i < sliceCount;i++)\r
- {\r
- fSlaveTrackers[i].SetOutputControl(&tmpOutputControl);\r
- tmpParam.ReadSettings(fin);\r
- InitializeSliceParam(i, tmpParam);\r
- }\r
- fin.close();\r
-\r
- fin.open("eventspbpbc/event.0.dump", std::ifstream::binary);\r
- for (int i = 0;i < sliceCount;i++)\r
- {\r
- tmpCluster[i].StartReading(i, 0);\r
- tmpCluster[i].ReadEvent(fin);\r
- tmpCluster[i].FinishReading();\r
- }\r
- fin.close();\r
-\r
- AliHLTTPCCASliceOutput **tmpOutput = new AliHLTTPCCASliceOutput*[sliceCount];\r
- memset(tmpOutput, 0, sliceCount * sizeof(AliHLTTPCCASliceOutput*));\r
-\r
- Reconstruct(tmpOutput, tmpCluster, 0, sliceCount);\r
- for (int i = 0;i < sliceCount;i++)\r
- {\r
- free(tmpOutput[i]);\r
- tmpOutput[i] = NULL;\r
- fSlaveTrackers[i].SetOutputControl(NULL);\r
- }\r
- delete[] tmpOutput;\r
- delete[] tmpCluster;\r
- fDebugLevel = useDebugLevel;\r
- }\r
-#endif\r
- return(0);\r
-}\r
-\r
-template <class T> inline T* AliHLTTPCCAGPUTrackerNVCC::alignPointer(T* ptr, int alignment)\r
-{\r
- //Macro to align Pointers.\r
- //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker\r
- //(As long as every single data structure is <= 1 MB)\r
-\r
- size_t adr = (size_t) ptr;\r
- if (adr % alignment)\r
- {\r
- adr += alignment - (adr % alignment);\r
- }\r
- return((T*) adr);\r
-}\r
-\r
-bool AliHLTTPCCAGPUTrackerNVCC::CudaFailedMsg(cudaError_t error)\r
-{\r
- //Check for CUDA Error and in the case of an error display the corresponding error string\r
- if (error == cudaSuccess) return(false);\r
- HLTWarning("CUDA Error: %d / %s", error, cudaGetErrorString(error));\r
- return(true);\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::CUDASync(char* state, int sliceLocal, int slice)\r
-{\r
- //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards\r
-\r
- if (fDebugLevel == 0) return(0);\r
- cudaError cuErr;\r
- cuErr = cudaGetLastError();\r
- if (cuErr != cudaSuccess)\r
- {\r
- HLTError("Cuda Error %s while running kernel (%s) (Slice %d; %d/%d)", cudaGetErrorString(cuErr), state, sliceLocal, slice, fgkNSlices);\r
- return(1);\r
- }\r
- if (CudaFailedMsg(cudaThreadSynchronize()))\r
- {\r
- HLTError("CUDA Error while synchronizing (%s) (Slice %d; %d/%d)", state, sliceLocal, slice, fgkNSlices);\r
- return(1);\r
- }\r
- if (fDebugLevel >= 3) HLTInfo("CUDA Sync Done");\r
- return(0);\r
-}\r
-\r
-void AliHLTTPCCAGPUTrackerNVCC::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)\r
-{\r
- //Set Debug Level and Debug output File if applicable\r
- fDebugLevel = dwLevel;\r
- if (NewOutFile) fOutFile = NewOutFile;\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::SetGPUTrackerOption(char* OptionName, int OptionValue)\r
-{\r
- //Set a specific GPU Tracker Option\r
- if (strcmp(OptionName, "PPMode") == 0)\r
- {\r
- fPPMode = OptionValue;\r
- }\r
- else if (strcmp(OptionName, "DebugMask") == 0)\r
- {\r
- fDebugMask = OptionValue;\r
- }\r
- else\r
- {\r
- HLTError("Unknown Option: %s", OptionName);\r
- return(1);\r
- }\r
- return(0);\r
-}\r
-\r
-#ifdef HLTCA_STANDALONE\r
-void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int iSlice, int i)\r
-{\r
- //Run Performance Query for timer i of slice iSlice\r
- if (fDebugLevel >= 1)\r
- {\r
- AliHLTTPCCAStandaloneFramework::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));\r
- }\r
-}\r
-#else\r
-void AliHLTTPCCAGPUTrackerNVCC::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}\r
-#endif\r
-\r
-void AliHLTTPCCAGPUTrackerNVCC::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)\r
-{\r
- //Dump Rowblocks to File\r
- if (fDebugLevel >= 4)\r
- {\r
- *fOutFile << "RowBlock Tracklets (Slice " << tracker[iSlice].Param().ISlice() << " (" << iSlice << " of reco))";\r
- *fOutFile << " after Tracklet Reconstruction";\r
- *fOutFile << std::endl;\r
- \r
- int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);\r
- int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);\r
- uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT);\r
- CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));\r
- CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));\r
- CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT, cudaMemcpyDeviceToHost));\r
- CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));\r
-\r
- int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;\r
- for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)\r
- {\r
- *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " << \r
- rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << std::endl << "Phase 1: ";\r
- for (int j = 0;j < rowBlockPos[i].x;j++)\r
- {\r
- //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!\r
- *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";\r
-#ifdef HLTCA_GPU_SCHED_FIXED_START\r
- if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)\r
- {\r
- HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);\r
- }\r
-#endif //HLTCA_GPU_SCHED_FIXED_START\r
- k++;\r
- if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)\r
- {\r
- HLTError("Error, -1 Tracklet found");\r
- }\r
- }\r
- *fOutFile << std::endl << "Phase 2: ";\r
- for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)\r
- {\r
- *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";\r
- }\r
- *fOutFile << std::endl;\r
- }\r
-\r
- if (check)\r
- {\r
- *fOutFile << "Starting Threads: (First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;\r
- for (int i = 0;i < HLTCA_GPU_BLOCK_COUNT;i++)\r
- {\r
- *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;\r
- }\r
- }\r
-\r
- free(rowBlockPos);\r
- free(rowBlockTracklets);\r
- free(blockStartingTracklet);\r
- }\r
-}\r
-\r
-__global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)\r
-{\r
- //Initialize GPU RowBlocks and HitWeights\r
- int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;\r
- int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;\r
- const int stride = blockDim.x * gridDim.x;\r
- int4 i0, i1;\r
- i0.x = i0.y = i0.z = i0.w = 0;\r
- i1.x = i1.y = i1.z = i1.w = -1;\r
- for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)\r
- RowBlockPos[i] = i0;\r
- for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)\r
- rowBlockTracklets4[i] = i1;\r
- for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)\r
- sliceDataHitWeights4[i] = i0;\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::Reconstruct(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)\r
-{\r
- //Primary reconstruction function\r
-\r
- cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;\r
-\r
- if (sliceCountLocal == -1) sliceCountLocal = fSliceCount;\r
- \r
- if (!fCudaInitialized)\r
- {\r
- HLTError("GPUTracker not initialized");\r
- return(1);\r
- }\r
- if (sliceCountLocal > fSliceCount)\r
- {\r
- HLTError("GPU Tracker was initialized to run with %d slices but was called to process %d slices", fSliceCount, sliceCountLocal);\r
- return(1);\r
- }\r
- if (fThreadId != GetThread())\r
- {\r
- HLTError("GPUTracker context was initialized by different thread, Initializing Thread: %d, Processing Thread: %d", fThreadId, GetThread());\r
- return(1);\r
- }\r
-\r
- if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice].Param().ISlice() + sliceCountLocal);\r
-\r
- if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)\r
- {\r
- HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));\r
- return(1);\r
- }\r
-\r
- if (fPPMode) return(ReconstructPP(pOutput, pClusterData, firstSlice, sliceCountLocal));\r
-\r
- memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);\r
-\r
- if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");\r
-\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
- unsigned long long int a, b, c, d;\r
- AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);\r
- AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);\r
-#endif\r
- \r
- for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
- {\r
- //Make this a GPU Tracker\r
- fGpuTracker[iSlice].SetGPUTracker();\r
- fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));\r
- fGpuTracker[iSlice].SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));\r
- fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);\r
-\r
- //Set Pointers to GPU Memory\r
- char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);\r
-\r
- if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");\r
- tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());\r
- tmpMem = alignPointer(tmpMem, 1024 * 1024);\r
-\r
- if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");\r
- tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);\r
- tmpMem = alignPointer(tmpMem, 1024 * 1024);\r
-\r
- if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");\r
- tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());\r
- tmpMem = alignPointer(tmpMem, 1024 * 1024);\r
-\r
- if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)\r
- {\r
- HLTError("Insufficiant Track Memory");\r
- return(1);\r
- }\r
-\r
- if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)\r
- {\r
- HLTError("Insufficiant Global Memory");\r
- return(1);\r
- }\r
-\r
- if (fDebugLevel >= 3)\r
- {\r
- HLTInfo("GPU Global Memory Used: %d/%d, Page Locked Tracks Memory used: %d / %d", tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice), HLTCA_GPU_GLOBAL_MEMORY, fGpuTracker[iSlice].TrackMemorySize(), HLTCA_GPU_TRACKS_MEMORY);\r
- }\r
-\r
- //Initialize Startup Constants\r
- *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;\r
- *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;\r
- *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;\r
- fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = HLTCA_GPU_BLOCK_COUNT * (iSlice + 1) / sliceCountLocal - HLTCA_GPU_BLOCK_COUNT * (iSlice) / sliceCountLocal;\r
- if (fDebugLevel >= 3) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);\r
- fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;\r
- fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;\r
- fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;\r
- fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());\r
- }\r
-\r
-#ifdef HLTCA_GPU_TEXTURE_FETCH\r
- cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();\r
- size_t offset;\r
- if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
- {\r
- HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);\r
- return(1);\r
- }\r
- cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();\r
- if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
- {\r
- HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);\r
- return(1);\r
- }\r
- cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();\r
- if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
- {\r
- HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);\r
- return(1);\r
- }\r
-#endif\r
-\r
- //Copy Tracker Object to GPU Memory\r
- if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");\r
-#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE\r
- char* tmpMem;\r
- if (CudaFailedMsg(cudaMalloc(&tmpMem, 100000000))) return(1);\r
- fGpuTracker[0].fStageAtSync = tmpMem;\r
- CudaFailedMsg(cudaMemset(fGpuTracker[0].StageAtSync(), 0, 100000000));\r
-#endif\r
- CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));\r
-\r
- for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
- {\r
- StandalonePerfTime(firstSlice + iSlice, 0);\r
-\r
- //Initialize GPU Slave Tracker\r
- if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");\r
- fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
- AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);\r
-#endif\r
- fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);\r
-\r
- if (fDebugLevel >= 4)\r
- {\r
- *fOutFile << std::endl << std::endl << "Reconstruction: " << iSlice << "/" << sliceCountLocal << " Total Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << " / " << fgkNSlices << std::endl;\r
-\r
- if (fDebugMask & 1) fSlaveTrackers[firstSlice + iSlice].DumpSliceData(*fOutFile);\r
- }\r
-\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
- AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);\r
- printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);\r
-#endif\r
- if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)\r
- {\r
- HLTError("Insufficiant Slice Data Memory");\r
- return(1);\r
- }\r
-\r
- if (fDebugLevel >= 3)\r
- {\r
- HLTInfo("GPU Slice Data Memory Used: %d/%d", fSlaveTrackers[firstSlice + iSlice].Data().MemorySize(), HLTCA_GPU_SLICE_DATA_MEMORY);\r
- }\r
-\r
- //Initialize temporary memory where needed\r
- if (fDebugLevel >= 3) HLTInfo("Copying Slice Data to GPU and initializing temporary memory"); \r
- PreInitRowBlocks<<<30, 256, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());\r
-\r
- //Copy Data to GPU Global Memory\r
- CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));\r
- CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));\r
- CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));\r
-\r
- if (fDebugLevel >= 4)\r
- {\r
- if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");\r
- fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTrackletsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] ), HLTCA_GPU_MAX_TRACKLETS);\r
- fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerHitsMemory(reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100]), pClusterData[iSlice].NumberOfClusters() );\r
- }\r
- \r
- if (CUDASync("Initialization", iSlice, iSlice + firstSlice)) return(1);\r
- StandalonePerfTime(firstSlice + iSlice, 1);\r
-\r
- if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder (Slice %d/%d)", iSlice, sliceCountLocal);\r
- AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);\r
-\r
- if (CUDASync("Neighbours finder", iSlice, iSlice + firstSlice)) return 1;\r
-\r
- StandalonePerfTime(firstSlice + iSlice, 2);\r
-\r
- if (fDebugLevel >= 4)\r
- {\r
- CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));\r
- if (fDebugMask & 2) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);\r
- }\r
-\r
- if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner (Slice %d/%d)", iSlice, sliceCountLocal);\r
- AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);\r
- if (CUDASync("Neighbours Cleaner", iSlice, iSlice + firstSlice)) return 1;\r
-\r
- StandalonePerfTime(firstSlice + iSlice, 3);\r
-\r
- if (fDebugLevel >= 4)\r
- {\r
- CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));\r
- if (fDebugMask & 4) fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);\r
- }\r
-\r
- if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder (Slice %d/%d)", iSlice, sliceCountLocal);\r
- AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);\r
- if (CUDASync("Start Hits Finder", iSlice, iSlice + firstSlice)) return 1;\r
-\r
- StandalonePerfTime(firstSlice + iSlice, 4);\r
-\r
- if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Sorter (Slice %d/%d)", iSlice, sliceCountLocal);\r
- AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<30, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);\r
- if (CUDASync("Start Hits Sorter", iSlice, iSlice + firstSlice)) return 1;\r
-\r
- StandalonePerfTime(firstSlice + iSlice, 5);\r
-\r
- if (fDebugLevel >= 2)\r
- {\r
- CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));\r
- if (fDebugLevel >= 3) HLTInfo("Obtaining Number of Start Hits from GPU: %d (Slice %d)", *fSlaveTrackers[firstSlice + iSlice].NTracklets(), iSlice);\r
- if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS)\r
- {\r
- HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant");\r
- return(1);\r
- }\r
- }\r
-\r
- if (fDebugLevel >= 4)\r
- {\r
- CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));\r
- if (fDebugMask & 8)\r
- {\r
- *fOutFile << "Temporary ";\r
- fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);\r
- }\r
- uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());\r
- CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));\r
- if (fDebugMask & 16)\r
- {\r
- *fOutFile << "Start Hits Sort Vector:" << std::endl;\r
- for (int i = 0;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows();i++)\r
- {\r
- *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;\r
- }\r
- }\r
- free(tmpMemory);\r
-\r
- CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));\r
- if (fDebugMask & 32) fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);\r
- }\r
-\r
- StandalonePerfTime(firstSlice + iSlice, 6);\r
- \r
- fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());\r
- }\r
-\r
- StandalonePerfTime(firstSlice, 7);\r
-#ifdef HLTCA_GPU_PREFETCHDATA\r
- for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
- {\r
- if (fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v) > ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4))\r
- {\r
- HLTError("Insufficiant GPU shared Memory, required: %d, available %d", fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v), ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4));\r
- return(1);\r
- }\r
- if (fDebugLevel >= 1)\r
- {\r
- static int infoShown = 0;\r
- if (!infoShown)\r
- {\r
- HLTInfo("GPU Shared Memory Cache Size: %d", 2 * fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v));\r
- infoShown = 1;\r
- }\r
- }\r
- }\r
-#endif\r
-\r
- int nHardCollisions = 0;\r
-\r
-RestartTrackletConstructor:\r
- if (fDebugLevel >= 3) HLTInfo("Initialising Tracklet Constructor Scheduler");\r
- for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
- {\r
- AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);\r
- if (CUDASync("Tracklet Initializer", iSlice, iSlice + firstSlice)) return 1;\r
- if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice);\r
- }\r
-\r
- if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");\r
- AliHLTTPCCATrackletConstructorGPU<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>();\r
- if (CUDASync("Tracklet Constructor", 0, firstSlice)) return 1;\r
- \r
- StandalonePerfTime(firstSlice, 8);\r
-\r
- if (fDebugLevel >= 4)\r
- {\r
- for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
- {\r
- if (fDebugMask & 64) DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);\r
- CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));\r
- if (fDebugLevel >= 5)\r
- {\r
- HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());\r
- }\r
- CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));\r
- CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));\r
- if (fDebugMask & 128) fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);\r
- }\r
- }\r
-\r
- int runSlices = 0;\r
- for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += runSlices)\r
- {\r
- if (runSlices < HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT) runSlices++;\r
- if (fDebugLevel >= 3) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + runSlices);\r
- AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(runSlices, sliceCountLocal - iSlice));\r
- if (CUDASync("Tracklet Selector", iSlice, iSlice + firstSlice)) return 1;\r
- }\r
- StandalonePerfTime(firstSlice, 9);\r
-\r
- int tmpSlice = 0, tmpSlice2 = 0;\r
- for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
- {\r
- if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");\r
-\r
- while(tmpSlice < sliceCountLocal && (tmpSlice == iSlice || cudaStreamQuery(cudaStreams[tmpSlice]) == CUDA_SUCCESS))\r
- {\r
- if (CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemory(), fGpuTracker[tmpSlice].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice]))) return(1);\r
- tmpSlice++;\r
- }\r
-\r
- while (tmpSlice2 < tmpSlice && (tmpSlice2 == iSlice ? cudaStreamSynchronize(cudaStreams[tmpSlice2]) : cudaStreamQuery(cudaStreams[tmpSlice2])) == CUDA_SUCCESS)\r
- {\r
- CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].Tracks(), fGpuTracker[tmpSlice2].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + tmpSlice2].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));\r
- CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + tmpSlice2].TrackHits(), fGpuTracker[tmpSlice2].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + tmpSlice2].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[tmpSlice2]));\r
- tmpSlice2++;\r
- }\r
-\r
- cudaStreamSynchronize(cudaStreams[iSlice]);\r
-\r
- if (fDebugLevel >= 4)\r
- {\r
- CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));\r
- if (fDebugMask & 256) fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);\r
- if (fDebugMask & 512) fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);\r
- }\r
-\r
- if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)\r
- {\r
- if ((fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION || fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_WRONG_ROW)&& nHardCollisions++ < 10)\r
- {\r
- if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError == HLTCA_GPU_ERROR_SCHEDULE_COLLISION)\r
- {\r
- HLTWarning("Hard scheduling collision occured, rerunning Tracklet Constructor");\r
- }\r
- else\r
- {\r
- HLTWarning("Tracklet Constructor returned invalid row");\r
- }\r
- for (int i = 0;i < sliceCountLocal;i++)\r
- {\r
- cudaThreadSynchronize();\r
- CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyDeviceToHost));\r
- *fSlaveTrackers[firstSlice + i].NTracks() = 0;\r
- *fSlaveTrackers[firstSlice + i].NTrackHits() = 0;\r
- fSlaveTrackers[firstSlice + i].GPUParameters()->fGPUError = HLTCA_GPU_ERROR_NONE;\r
- CudaFailedMsg(cudaMemcpy(fGpuTracker[i].CommonMemory(), fSlaveTrackers[firstSlice + i].CommonMemory(), fGpuTracker[i].CommonMemorySize(), cudaMemcpyHostToDevice));\r
- PreInitRowBlocks<<<30, 256>>>(fGpuTracker[i].RowBlockPos(), fGpuTracker[i].RowBlockTracklets(), fGpuTracker[i].Data().HitWeights(), fSlaveTrackers[firstSlice + i].Data().NumberOfHitsPlusAlign());\r
- }\r
- goto RestartTrackletConstructor;\r
- }\r
- HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);\r
- return(1);\r
- }\r
- if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());\r
-\r
- fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
- AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);\r
-#endif\r
- fSlaveTrackers[firstSlice + iSlice].WriteOutput();\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
- AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);\r
- printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);\r
-#endif\r
-\r
- if (fDebugLevel >= 4)\r
- {\r
- delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();\r
- delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();\r
- }\r
- }\r
-\r
- StandalonePerfTime(firstSlice, 10);\r
-\r
- if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");\r
-\r
-#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE\r
- char* stageAtSync = (char*) malloc(100000000);\r
- CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].StageAtSync(), 100 * 1000 * 1000, cudaMemcpyDeviceToHost));\r
- cudaFree(fGpuTracker[0].StageAtSync());\r
-\r
- FILE* fp = fopen("profile.txt", "w+");\r
- FILE* fp2 = fopen("profile.bmp", "w+b");\r
- int nEmptySync = 0, fEmpty;\r
-\r
- const int bmpheight = 1000;\r
- BITMAPFILEHEADER bmpFH;\r
- BITMAPINFOHEADER bmpIH;\r
- ZeroMemory(&bmpFH, sizeof(bmpFH));\r
- ZeroMemory(&bmpIH, sizeof(bmpIH));\r
- \r
- bmpFH.bfType = 19778; //"BM"\r
- bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1) * bmpheight ;\r
- bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);\r
-\r
- bmpIH.biSize = sizeof(bmpIH);\r
- bmpIH.biWidth = HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1;\r
- bmpIH.biHeight = bmpheight;\r
- bmpIH.biPlanes = 1;\r
- bmpIH.biBitCount = 32;\r
-\r
- fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);\r
- fwrite(&bmpIH, 1, sizeof(bmpIH), fp2); \r
-\r
- for (int i = 0;i < bmpheight * HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;i += HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT)\r
- {\r
- fEmpty = 1;\r
- for (int j = 0;j < HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;j++)\r
- {\r
- fprintf(fp, "%d\t", stageAtSync[i + j]);\r
- int color = 0;\r
- if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);\r
- if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);\r
- if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);\r
- if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);\r
- fwrite(&color, 1, sizeof(int), fp2);\r
- if (j > 0 && j % 32 == 0)\r
- {\r
- color = RGB(255, 255, 255);\r
- fwrite(&color, 1, 4, fp2);\r
- }\r
- if (stageAtSync[i + j]) fEmpty = 0;\r
- }\r
- fprintf(fp, "\n");\r
- if (fEmpty) nEmptySync++;\r
- else nEmptySync = 0;\r
- //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;\r
- }\r
-\r
- fclose(fp);\r
- fclose(fp2);\r
- free(stageAtSync);\r
-#endif \r
-\r
- return(0);\r
-}\r
-\r
-__global__ void ClearPPHitWeights(int sliceCount)\r
-{\r
- //Clear HitWeights\r
- \r
- for (int k = 0;k < sliceCount;k++)\r
- {\r
- AliHLTTPCCATracker &tracker = ((AliHLTTPCCATracker*) gAliHLTTPCCATracker)[k];\r
- int4* const pHitWeights = (int4*) tracker.Data().HitWeights();\r
- const int dwCount = tracker.Data().NumberOfHitsPlusAlign();\r
- const int stride = blockDim.x * gridDim.x;\r
- int4 i0;\r
- i0.x = i0.y = i0.z = i0.w = 0;\r
- \r
- for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < dwCount * sizeof(int) / sizeof(int4);i += stride)\r
- {\r
- pHitWeights[i] = i0;\r
- }\r
- }\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::ReconstructPP(AliHLTTPCCASliceOutput** pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)\r
-{\r
- //Primary reconstruction function for small events (PP)\r
-\r
- memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);\r
-\r
- if (fDebugLevel >= 3) HLTInfo("Allocating GPU Tracker memory and initializing constants");\r
-\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
- unsigned long long int a, b, c, d;\r
- AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(&c);\r
- AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&d);\r
-#endif\r
-\r
- char* tmpSliceMemHost = (char*) SliceDataMemory(fHostLockedMemory, 0);\r
- char* tmpSliceMemGpu = (char*) SliceDataMemory(fGPUMemory, 0);\r
-\r
- for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
- {\r
- StandalonePerfTime(firstSlice + iSlice, 0);\r
-\r
- //Initialize GPU Slave Tracker\r
- if (fDebugLevel >= 3) HLTInfo("Creating Slice Data");\r
- fSlaveTrackers[firstSlice + iSlice].SetGPUSliceDataMemory(tmpSliceMemHost, RowMemory(fHostLockedMemory, firstSlice + iSlice));\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
- AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);\r
-#endif\r
- fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
- AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);\r
- printf("Read %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);\r
-#endif\r
- if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)\r
- {\r
- HLTError("Insufficiant Slice Data Memory");\r
- return(1);\r
- }\r
-\r
- //Make this a GPU Tracker\r
- fGpuTracker[iSlice].SetGPUTracker();\r
- fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));\r
-\r
-\r
- fGpuTracker[iSlice].SetGPUSliceDataMemory(tmpSliceMemGpu, RowMemory(fGPUMemory, iSlice));\r
- fGpuTracker[iSlice].SetPointersSliceData(&pClusterData[iSlice], false);\r
-\r
- tmpSliceMemHost += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();\r
- tmpSliceMemHost = alignPointer(tmpSliceMemHost, 64 * 1024);\r
- tmpSliceMemGpu += fSlaveTrackers[firstSlice + iSlice].Data().MemorySize();\r
- tmpSliceMemGpu = alignPointer(tmpSliceMemGpu, 64 * 1024);\r
-\r
-\r
- //Set Pointers to GPU Memory\r
- char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);\r
-\r
- if (fDebugLevel >= 3) HLTInfo("Initialising GPU Hits Memory");\r
- tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());\r
- tmpMem = alignPointer(tmpMem, 64 * 1024);\r
-\r
- if (fDebugLevel >= 3) HLTInfo("Initialising GPU Tracklet Memory");\r
- tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);\r
- tmpMem = alignPointer(tmpMem, 64 * 1024);\r
-\r
- if (fDebugLevel >= 3) HLTInfo("Initialising GPU Track Memory");\r
- tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());\r
- tmpMem = alignPointer(tmpMem, 64 * 1024);\r
-\r
- if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)\r
- {\r
- HLTError("Insufficiant Track Memory");\r
- return(1);\r
- }\r
-\r
- if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)\r
- {\r
- HLTError("Insufficiant Global Memory");\r
- return(1);\r
- }\r
-\r
- //Initialize Startup Constants\r
- *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;\r
- *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;\r
- *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;\r
- fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;\r
-\r
- fGpuTracker[iSlice].SetGPUTextureBase(fGpuTracker[0].Data().Memory());\r
-\r
- if (CUDASync("Initialization", iSlice, iSlice + firstSlice)) return(1);\r
- StandalonePerfTime(firstSlice + iSlice, 1);\r
- }\r
-\r
-#ifdef HLTCA_GPU_TEXTURE_FETCH\r
- cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();\r
- size_t offset;\r
- if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
- {\r
- HLTError("Error binding CUDA Texture ushort2 (Offset %d)", (int) offset);\r
- return(1);\r
- }\r
- cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();\r
- if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
- {\r
- HLTError("Error binding CUDA Texture ushort (Offset %d)", (int) offset);\r
- return(1);\r
- }\r
- cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();\r
- if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
- {\r
- HLTError("Error binding CUDA Texture short (Offset %d)", (int) offset);\r
- return(1);\r
- }\r
-#endif\r
-\r
- //Copy Tracker Object to GPU Memory\r
- if (fDebugLevel >= 3) HLTInfo("Copying Tracker objects to GPU");\r
- CudaFailedMsg(cudaMemcpyToSymbol(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice));\r
-\r
- //Copy Data to GPU Global Memory\r
- for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
- {\r
- CudaFailedMsg(cudaMemcpy(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice));\r
- //printf("%lld %lld %d %d\n", (size_t) (char*) fGpuTracker[iSlice].Data().Memory(), (size_t) (char*) fSlaveTrackers[firstSlice + iSlice].Data().Memory(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), (int) (size_t) fSlaveTrackers[firstSlice + iSlice].Data().MemorySize());\r
- }\r
- //CudaFailedMsg(cudaMemcpy(SliceDataMemory(fGPUMemory, 0), SliceDataMemory(fHostLockedMemory, 0), tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0), cudaMemcpyHostToDevice));\r
- //printf("%lld %lld %d\n", (size_t) (char*) SliceDataMemory(fGPUMemory, 0), (size_t) (char*) SliceDataMemory(fHostLockedMemory, 0), (int) (size_t) (tmpSliceMemHost - (char*) SliceDataMemory(fHostLockedMemory, 0)));\r
- CudaFailedMsg(cudaMemcpy(fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyHostToDevice));\r
- CudaFailedMsg(cudaMemcpy(fGpuTracker[0].SliceDataRows(), fSlaveTrackers[firstSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) * sliceCountLocal, cudaMemcpyHostToDevice));\r
-\r
- if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Finder");\r
- AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursFinder> <<<30, 256>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows());\r
- if (CUDASync("Neighbours finder", 0, firstSlice)) return 1;\r
- StandalonePerfTime(firstSlice, 2);\r
- if (fDebugLevel >= 3) HLTInfo("Running GPU Neighbours Cleaner");\r
- AliHLTTPCCAProcessMultiA<AliHLTTPCCANeighboursCleaner> <<<30, 256>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 2);\r
- if (CUDASync("Neighbours Cleaner", 0, firstSlice)) return 1;\r
- StandalonePerfTime(firstSlice, 3);\r
- if (fDebugLevel >= 3) HLTInfo("Running GPU Start Hits Finder");\r
- AliHLTTPCCAProcessMultiA<AliHLTTPCCAStartHitsFinder> <<<30, 256>>>(0, sliceCountLocal, fSlaveTrackers[firstSlice].Param().NRows() - 6);\r
- if (CUDASync("Start Hits Finder", 0, firstSlice)) return 1;\r
- StandalonePerfTime(firstSlice, 4);\r
-\r
- ClearPPHitWeights <<<30, 256>>>(sliceCountLocal);\r
- if (CUDASync("Clear Hit Weights", 0, firstSlice)) return 1;\r
-\r
- for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
- {\r
- fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());\r
- }\r
-\r
- StandalonePerfTime(firstSlice, 7);\r
-\r
- if (fDebugLevel >= 3) HLTInfo("Running GPU Tracklet Constructor");\r
- AliHLTTPCCATrackletConstructorGPUPP<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal);\r
- if (CUDASync("Tracklet Constructor PP", 0, firstSlice)) return 1;\r
- \r
- StandalonePerfTime(firstSlice, 8);\r
-\r
- AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>(0, sliceCountLocal);\r
- if (CUDASync("Tracklet Selector", 0, firstSlice)) return 1;\r
- StandalonePerfTime(firstSlice, 9);\r
-\r
- CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice].CommonMemory(), fGpuTracker[0].CommonMemory(), fSlaveTrackers[firstSlice].CommonMemorySize() * sliceCountLocal, cudaMemcpyDeviceToHost));\r
-\r
- for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)\r
- {\r
- if (fDebugLevel >= 3) HLTInfo("Transfering Tracks from GPU to Host");\r
-\r
- CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost));\r
- CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost));\r
-\r
- if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)\r
- {\r
- HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);\r
- return(1);\r
- }\r
- if (fDebugLevel >= 3) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());\r
-\r
- fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
- AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&a);\r
-#endif\r
- fSlaveTrackers[firstSlice + iSlice].WriteOutput();\r
-#ifdef HLTCA_GPU_TIME_PROFILE\r
- AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&b);\r
- printf("Write %f %f\n", ((double) b - (double) a) / (double) c, ((double) a - (double) d) / (double) c);\r
-#endif\r
- }\r
-\r
- StandalonePerfTime(firstSlice, 10);\r
-\r
- if (fDebugLevel >= 3) HLTInfo("GPU Reconstruction finished");\r
-\r
- return(0);\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::InitializeSliceParam(int iSlice, AliHLTTPCCAParam ¶m)\r
-{\r
- //Initialize Slice Tracker Parameter for a slave tracker\r
- fSlaveTrackers[iSlice].Initialize(param);\r
- if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)\r
- {\r
- HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);\r
- return(1);\r
- }\r
- return(0);\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::ExitGPU()\r
-{\r
- //Uninitialize CUDA\r
- cudaThreadSynchronize();\r
- if (fGPUMemory)\r
- {\r
- cudaFree(fGPUMemory);\r
- fGPUMemory = NULL;\r
- }\r
- if (fHostLockedMemory)\r
- {\r
- for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)\r
- {\r
- cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);\r
- }\r
- free(fpCudaStreams);\r
- fGpuTracker = NULL;\r
- cudaFreeHost(fHostLockedMemory);\r
- }\r
-\r
- if (CudaFailedMsg(cudaThreadExit()))\r
- {\r
- HLTError("Could not uninitialize GPU");\r
- return(1);\r
- }\r
- HLTInfo("CUDA Uninitialized");\r
- fgGPUUsed = false;\r
- fCudaInitialized = 0;\r
- return(0);\r
-}\r
-\r
-void AliHLTTPCCAGPUTrackerNVCC::SetOutputControl( AliHLTTPCCASliceOutput::outputControlStruct* val)\r
-{\r
- //Set Output Control Pointers\r
- fOutputControl = val;\r
- for (int i = 0;i < fgkNSlices;i++)\r
- {\r
- fSlaveTrackers[i].SetOutputControl(val);\r
- }\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::GetThread()\r
-{\r
- //Get Thread ID\r
-#ifdef R__WIN32\r
- return((int) (size_t) GetCurrentThread());\r
-#else\r
- return((int) syscall (SYS_gettid));\r
-#endif\r
-}\r
-\r
-unsigned long long int* AliHLTTPCCAGPUTrackerNVCC::PerfTimer(int iSlice, unsigned int i)\r
-{\r
- //Returns pointer to PerfTimer i of slice iSlice\r
- return(fSlaveTrackers ? fSlaveTrackers[iSlice].PerfTimer(i) : NULL);\r
-}\r
-\r
-const AliHLTTPCCASliceOutput::outputControlStruct* AliHLTTPCCAGPUTrackerNVCC::OutputControl() const\r
-{\r
- //Return Pointer to Output Control Structure\r
- return fOutputControl;\r
-}\r
-\r
-int AliHLTTPCCAGPUTrackerNVCC::GetSliceCount() const\r
-{\r
- //Return max slice count processable\r
- return(fSliceCount);\r
-}\r
-\r
-AliHLTTPCCAGPUTracker* AliHLTTPCCAGPUTrackerNVCCCreate()\r
-{\r
- return new AliHLTTPCCAGPUTrackerNVCC;\r
-}\r
-\r
-void AliHLTTPCCAGPUTrackerNVCCDestroy(AliHLTTPCCAGPUTracker* ptr)\r
-{\r
- delete ptr;\r
-}\r
-\r
#include "AliHLTTPCCADef.h"
#include "AliHLTTPCCATracklet.h"
#include "AliHLTTPCCATrackletConstructor.h"
-#include "MemoryAssignmentHelpers.h"
-
-//#include "AliHLTTPCCAPerformance.h"
-//#include "TH1D.h"
-
-//#define DRAW
-
-#ifdef DRAW
-#include "AliHLTTPCCADisplay.h"
-#endif //DRAW
#define kMaxRowGap 4
tParam.InitParam();
}
-GPUdi() void AliHLTTPCCATrackletConstructor::ReadData
-#ifndef HLTCA_GPU_PREFETCHDATA
-( int /*iThread*/, AliHLTTPCCASharedMemory& /*s*/, AliHLTTPCCAThreadMemory& /*r*/, AliHLTTPCCATracker& /*tracker*/, int /*iRow*/ )
-{
- //Prefetch Data to shared memory
-#else
-( int iThread, AliHLTTPCCASharedMemory& s, AliHLTTPCCAThreadMemory& r, AliHLTTPCCATracker& tracker, int iRow )
-{
- // reconstruction of tracklets, read data step
- const AliHLTTPCCARow &row = tracker.Row( iRow );
- //bool jr = !r.fCurrentData;
-
- // copy hits, grid content and links
-
- // FIXME: inefficient copy
- //const int numberOfHitsAligned = NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(row.NHits());
-
-/*
-#ifdef HLTCA_GPU_REORDERHITDATA
- ushort2 *sMem1 = reinterpret_cast<ushort2 *>( s.fData[jr] );
- for ( int i = iThread; i < numberOfHitsAligned; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) {
- sMem1[i].x = tracker.HitDataY( row, i );
- sMem1[i].y = tracker.HitDataZ( row, i );
- }
-#else
- ushort_v *sMem1 = reinterpret_cast<ushort_v *>( s.fData[jr] );
- for ( int i = iThread; i < numberOfHitsAligned; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) {
- sMem1[i] = tracker.HitDataY( row, i );
- }
-
- ushort_v *sMem1a = reinterpret_cast<ushort_v *>( s.fData[jr] ) + numberOfHitsAligned;
- for ( int i = iThread; i < numberOfHitsAligned; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) {
- sMem1a[i] = tracker.HitDataZ( row, i );
- }
-#endif //HLTCA_GPU_REORDERHITDATA
-
- short *sMem2 = reinterpret_cast<short *>( s.fData[jr] ) + 2 * numberOfHitsAligned;
- for ( int i = iThread; i < numberOfHitsAligned; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) {
- sMem2[i] = tracker.HitLinkUpData( row, i );
- }
-
- unsigned short *sMem3 = reinterpret_cast<unsigned short *>( s.fData[jr] ) + 3 * numberOfHitsAligned;
- const int n = row.FullSize(); // + grid content size
- for ( int i = iThread; i < n; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) {
- sMem3[i] = tracker.FirstHitInBin( row, i );
- }*/
-
- /*for (int k = 0;k < 2;k++)
- {
- HLTCA_GPU_ROWCOPY* sharedMem;
- const HLTCA_GPU_ROWCOPY* sourceMem;
- int copyCount;
- switch (k)
- {
- case 0:
- sourceMem = reinterpret_cast<const HLTCA_GPU_ROWCOPY *>( tracker.HitDataY(row) );
- sharedMem = reinterpret_cast<HLTCA_GPU_ROWCOPY *> (reinterpret_cast<ushort_v *>( s.fData[jr] ) + k * numberOfHitsAligned);
- copyCount = numberOfHitsAligned * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY);
- break;
- case 1:
- sourceMem = reinterpret_cast<const HLTCA_GPU_ROWCOPY *>( tracker.HitDataZ(row) );
- sharedMem = reinterpret_cast<HLTCA_GPU_ROWCOPY *> (reinterpret_cast<ushort_v *>( s.fData[jr] ) + k * numberOfHitsAligned);
- copyCount = numberOfHitsAligned * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY);
- break;
- case 2:
- sourceMem = reinterpret_cast<const HLTCA_GPU_ROWCOPY *>( tracker.HitLinkUpData(row) );
- sharedMem = reinterpret_cast<HLTCA_GPU_ROWCOPY *> (reinterpret_cast<ushort_v *>( s.fData[jr] ) + k * numberOfHitsAligned);
- copyCount = numberOfHitsAligned * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY);
- break;
- case 1:
- sourceMem = reinterpret_cast<const HLTCA_GPU_ROWCOPY *>( tracker.FirstHitInBin(row) );
- sharedMem = reinterpret_cast<HLTCA_GPU_ROWCOPY *> (reinterpret_cast<ushort_v *>( s.fData[jr] ) + k * numberOfHitsAligned);
- copyCount = NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(row.FullSize()) * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY);
- break;
- }
- for (int i = iThread;i < copyCount;i += TRACKLET_CONSTRUCTOR_NMEMTHREDS)
- {
- sharedMem[i] = sourceMem[i];
- }
- }*/
-
- for (unsigned int i = iThread;i < row.FullSize() * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY);i += TRACKLET_CONSTRUCTOR_NMEMTHREDS)
- {
- reinterpret_cast<HLTCA_GPU_ROWCOPY *> (reinterpret_cast<ushort_v *>( s.fData[!r.fCurrentData] ))[i] = reinterpret_cast<const HLTCA_GPU_ROWCOPY *>( tracker.FirstHitInBin(row) )[i];
- }
-
- const HLTCA_GPU_ROWCOPY* const sourceMem = (const HLTCA_GPU_ROWCOPY *) &row;
- HLTCA_GPU_ROWCOPY* const sharedMem = reinterpret_cast<HLTCA_GPU_ROWCOPY *> ( &s.fRow[!r.fCurrentData] );
- for (unsigned int i = iThread;i < sizeof(AliHLTTPCCARow) / sizeof(HLTCA_GPU_ROWCOPY);i += TRACKLET_CONSTRUCTOR_NMEMTHREDS)
- {
- sharedMem[i] = sourceMem[i];
- }
-#endif //!HLTCA_GPU_PREFETCHDATA
-}
-
GPUdi() void AliHLTTPCCATrackletConstructor::StoreTracklet
( int /*nBlocks*/, int /*nThreads*/, int /*iBlock*/, int /*iThread*/,
{
// reconstruction of tracklets, tracklet store step
- //AliHLTTPCCAPerformance::Instance().HNHitsPerTrackCand()->Fill(r.fNHits);
-
do {
- {
- //std::cout<<"tracklet to store: "<<r.fItr<<", nhits = "<<r.fNHits<<std::endl;
- }
-
if ( r.fNHits < TRACKLET_SELECTOR_MIN_HITS ) {
r.fNHits = 0;
break;
tracklet.SetNHits( r.fNHits );
if ( r.fNHits > 0 ) {
-#ifdef DRAW
- if ( 0 ) {
- std::cout << "store tracklet " << r.fItr << ", nhits = " << r.fNHits << std::endl;
- if ( AliHLTTPCCADisplay::Instance().DrawTracklet( tParam, hitstore, kBlue, 1. ) ) {
- AliHLTTPCCADisplay::Instance().Ask();
- }
- }
-#endif //DRAW
if ( CAMath::Abs( tParam.Par()[4] ) < 1.e-4 ) tParam.SetPar( 4, 1.e-4 );
if (r.fStartRow < r.fFirstRow) r.fFirstRow = r.fStartRow;
tracklet.SetFirstRow( r.fFirstRow );
int ih = tracklet.RowHit( iRow );
#endif //EXTERN_ROW_HITS
if ( ih >= 0 ) {
-#if defined(HLTCA_GPUCODE) & !defined(HLTCA_GPU_PREFETCHDATA)
+#if defined(HLTCA_GPUCODE)
tracker.MaximizeHitWeight( s.fRows[ iRow ], ih, w );
#else
tracker.MaximizeHitWeight( tracker.Row( iRow ), ih, w );
-#endif //HLTCA_GPUCODE & !HLTCA_GPU_PREFETCHDATA
+#endif //HLTCA_GPUCODE
}
}
}
{
// reconstruction of tracklets, tracklets update step
- //std::cout<<"Update tracklet: "<<r.fItr<<" "<<r.fGo<<" "<<r.fStage<<" "<<iRow<<std::endl;
- bool drawSearch = 0;//r.fItr==2;
- bool drawFit = 0;//r.fItr==2;
- bool drawFitted = drawFit ;//|| 1;//r.fItr==16;
-
if ( !r.fGo ) return;
#ifndef EXTERN_ROW_HITS
AliHLTTPCCATracklet &tracklet = tracker.Tracklets()[r.fItr];
#endif //EXTERN_ROW_HITS
-#ifdef HLTCA_GPU_PREFETCHDATA
- const AliHLTTPCCARow &row = s.fRow[r.fCurrentData];
-#elif defined(HLTCA_GPUCODE)
+#if defined(HLTCA_GPUCODE)
const AliHLTTPCCARow &row = s.fRows[iRow];
#else
const AliHLTTPCCARow &row = tracker.Row( iRow );
-#endif //HLTCA_GPU_PREFETCHDATA
+#endif //HLTCA_GPUCODE
float y0 = row.Grid().YMin();
float stepY = row.HstepY();
}
-//#ifdef HLTCA_GPU_PREFETCHDATA
-// uint4 *tmpint4 = s.fData[r.fCurrentData];
-//#endif
ushort2 hh;
-//#ifdef HLTCA_GPU_REORDERHITDATA
-// hh = reinterpret_cast<ushort2*>( tmpint4 )[r.fCurrIH];
-//#else
-//#ifdef HLTCA_GPU_PREFETCHDATA
-// hh.x = reinterpret_cast<ushort_v*>( tmpint4 )[r.fCurrIH];
-// hh.y = reinterpret_cast<ushort_v*>( tmpint4 )[NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(row.NHits()) + r.fCurrIH];
-//#else
#if defined(HLTCA_GPU_TEXTURE_FETCH)
hh = tex1Dfetch(gAliTexRefu2, ((char*) tracker.Data().HitData() - tracker.Data().GPUTextureBase()) / sizeof(ushort2) + row.HitNumberOffset() + r.fCurrIH);
#else
hh = tracker.HitData(row)[r.fCurrIH];
#endif //HLTCA_GPU_TEXTURE_FETCH
-//#endif
-//#endif
int oldIH = r.fCurrIH;
-//#ifdef HLTCA_GPU_PREFETCHDATA
-// r.fCurrIH = reinterpret_cast<short*>( tmpint4 )[2 * NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(row.NHits()) + r.fCurrIH]; // read from linkup data
-//#else
#if defined(HLTCA_GPU_TEXTURE_FETCH)
r.fCurrIH = tex1Dfetch(gAliTexRefs, ((char*) tracker.Data().HitLinkUpData(row) - tracker.Data().GPUTextureBase()) / sizeof(unsigned short) + r.fCurrIH);
#else
r.fCurrIH = tracker.HitLinkUpData(row)[r.fCurrIH]; // read from linkup data
#endif //HLTCA_GPU_TEXTURE_FETCH
-//#endif
float x = row.X();
float y = y0 + hh.x * stepY;
float z = z0 + hh.y * stepZ;
-#ifdef DRAW
- if ( drawFit ) std::cout << " fit tracklet: new hit " << oldIH << ", xyz=" << x << " " << y << " " << z << std::endl;
-#endif //DRAW
if ( iRow == r.fStartRow ) {
tParam.SetX( x );
tParam.SetZ( z );
r.fLastY = y;
r.fLastZ = z;
- #ifdef DRAW
- if ( drawFit ) std::cout << " fit tracklet " << r.fItr << ", row " << iRow << " first row" << std::endl;
- #endif //DRAW
} else {
float err2Y, err2Z;
tParam.SetCov( 0, err2Y );
tParam.SetCov( 2, err2Z );
}
- if ( drawFit ) {
- #ifdef DRAW
- std::cout << " fit tracklet " << r.fItr << ", row " << iRow << " transporting.." << std::endl;
- std::cout << " params before transport=" << std::endl;
- tParam.Print();
- #endif //DRAW
- }
float sinPhi, cosPhi;
if ( r.fNHits >= 10 && CAMath::Abs( tParam.SinPhi() ) < .99 ) {
sinPhi = tParam.SinPhi();
sinPhi = dy * ri;
cosPhi = dx * ri;
}
- #ifdef DRAW
- if ( drawFit ) std::cout << "sinPhi0 = " << sinPhi << ", cosPhi0 = " << cosPhi << std::endl;
- #endif //DRAW
if ( !tParam.TransportToX( x, sinPhi, cosPhi, tracker.Param().ConstBz(), -1 ) ) {
- #ifdef DRAW
- if ( drawFit ) std::cout << " tracklet " << r.fItr << ", row " << iRow << ": can not transport!!" << std::endl;
- #endif //DRAW
#ifndef EXTERN_ROW_HITS
tracklet.SetRowHit( iRow, -1 );
#else
#endif //EXTERN_ROW_HITS
break;
}
- //std::cout<<"mark1 "<<r.fItr<<std::endl;
- //tParam.Print();
tracker.GetErrors2( iRow, tParam.GetZ(), sinPhi, cosPhi, tParam.GetDzDs(), err2Y, err2Z );
- //std::cout<<"mark2"<<std::endl;
-
- if ( drawFit ) {
- #ifdef DRAW
- std::cout << " params after transport=" << std::endl;
- tParam.Print();
- std::cout << "fit tracklet before filter: " << r.fItr << ", row " << iRow << " errs=" << err2Y << " " << err2Z << std::endl;
- AliHLTTPCCADisplay::Instance().DrawTracklet( tParam, hitstore, kBlue, 2., 1 );
- AliHLTTPCCADisplay::Instance().Ask();
- #endif //DRAW
- }
+
if ( !tParam.Filter( y, z, err2Y, err2Z, .99 ) ) {
- #ifdef DRAW
- if ( drawFit ) std::cout << " tracklet " << r.fItr << ", row " << iRow << ": can not filter!!" << std::endl;
- #endif //DRAW
#ifndef EXTERN_ROW_HITS
tracklet.SetRowHit( iRow, -1 );
#else
#else
tracker.TrackletRowHits()[iRow * s.fNTracklets + r.fItr] = oldIH;
#endif //!EXTERN_ROW_HITS
- if ( drawFit ) {
- #ifdef DRAW
- std::cout << "fit tracklet after filter " << r.fItr << ", row " << iRow << std::endl;
- tParam.Print();
- AliHLTTPCCADisplay::Instance().DrawTracklet( tParam, hitstore, kGreen, 2. );
- AliHLTTPCCADisplay::Instance().Ask();
- #endif //DRAW
- }
r.fNHits++;
r.fLastRow = iRow;
r.fEndRow = iRow;
} while ( 0 );
if ( r.fCurrIH < 0 ) {
- #ifdef DRAW
- if ( drawFitted ) std::cout << "fitted tracklet " << r.fItr << ", nhits=" << r.fNHits << std::endl;
- #endif //DRAW
r.fStage = 1;
- //AliHLTTPCCAPerformance::Instance().HNHitsPerSeed()->Fill(r.fNHits);
if ( CAMath::Abs( tParam.SinPhi() ) > .999 ) {
- #ifdef DRAW
- if ( drawFitted ) std::cout << " fitted tracklet error: sinPhi=" << tParam.SinPhi() << std::endl;
- #endif //DRAW
r.fNHits = 0; r.fGo = 0;
} else {
//tParam.SetCosPhi( CAMath::Sqrt(1-tParam.SinPhi()*tParam.SinPhi()) );
}
- if ( drawFitted ) {
- #ifdef DRAW
- std::cout << "fitted tracklet " << r.fItr << " miss=" << r.fNMissed << " go=" << r.fGo << std::endl;
- tParam.Print();
- AliHLTTPCCADisplay::Instance().DrawTracklet( tParam, hitstore, kBlue );
- AliHLTTPCCADisplay::Instance().Ask();
- #endif //DRAW
- }
}
} else { // forward/backward searching part
do {
- if ( drawSearch ) {
- #ifdef DRAW
- std::cout << "search tracklet " << r.fItr << " row " << iRow << " miss=" << r.fNMissed << " go=" << r.fGo << " stage=" << r.fStage << std::endl;
- #endif //DRAW
- }
-
if ( r.fStage == 2 && ( ( iRow >= r.fEndRow ) ||
( iRow >= r.fStartRow && ( iRow - r.fStartRow ) % 2 == 0 )
) ) break;
float x = row.X();
float err2Y, err2Z;
- if ( drawSearch ) {
- #ifdef DRAW
- std::cout << "tracklet " << r.fItr << " before transport to row " << iRow << " : " << std::endl;
- tParam.Print();
- #endif //DRAW
- }
if ( !tParam.TransportToX( x, tParam.SinPhi(), tParam.GetCosPhi(), tracker.Param().ConstBz(), .99 ) ) {
- #ifdef DRAW
- if ( drawSearch ) std::cout << " tracklet " << r.fItr << ", row " << iRow << ": can not transport!!" << std::endl;
- #endif //DRAW
#ifndef EXTERN_ROW_HITS
tracklet.SetRowHit(iRow, -1);
#else
#endif //!EXTERN_ROW_HITS
break;
}
- if ( drawSearch ) {
- #ifdef DRAW
- std::cout << "tracklet " << r.fItr << " after transport to row " << iRow << " : " << std::endl;
- tParam.Print();
- AliHLTTPCCADisplay::Instance().DrawTracklet( tParam, hitstore, kBlue, 2., 1 );
- AliHLTTPCCADisplay::Instance().Ask();
- #endif //DRAW
- }
-#ifdef HLTCA_GPU_PREFETCHDATA
- uint4 *tmpint4 = s.fData[r.fCurrentData];
-#endif //HLTCA_GPU_PREFETCHDATA
-
-//#ifdef HLTCA_GPU_REORDERHITDATA
-// const ushort2 *hits = reinterpret_cast<ushort2*>( tmpint4 );
-//#else
-//#ifdef HLTCA_GPU_PREFETCHDATA
-// const ushort_v *hitsx = reinterpret_cast<ushort_v*>( tmpint4 );
-// const ushort_v *hitsy = reinterpret_cast<ushort_v*>( tmpint4 ) + NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(row.NHits());
-//#else
+
#ifndef HLTCA_GPU_TEXTURE_FETCH
const ushort2 *hits = tracker.HitData(row);
#endif //!HLTCA_GPU_TEXTURE_FETCH
-//#endif
-//#endif
float fY = tParam.GetY();
float fZ = tParam.GetZ();
unsigned int fHitYfst = 1, fHitYlst = 0, fHitYfst1 = 1, fHitYlst1 = 0;
- if ( drawSearch ) {
-#ifdef DRAW
- std::cout << " tracklet " << r.fItr << ", row " << iRow << ": grid N=" << row.Grid().N() << std::endl;
- std::cout << " tracklet " << r.fItr << ", row " << iRow << ": minbin=" << fIndYmin << std::endl;
-#endif //DRAW
- }
{
int nY = row.Grid().Ny();
-//#ifdef HLTCA_GPU_PREFETCHDATA
-// const unsigned short *sGridP = ( reinterpret_cast<unsigned short*>( tmpint4 ) );
-//#else
#ifndef HLTCA_GPU_TEXTURE_FETCH
const unsigned short *sGridP = tracker.FirstHitInBin(row);
#endif //!HLTCA_GPU_TEXTURE_FETCH
-//#endif
#ifdef HLTCA_GPU_TEXTURE_FETCH
fHitYfst = tex1Dfetch(gAliTexRefu, ((char*) tracker.Data().FirstHitInBin(row) - tracker.Data().GPUTextureBase()) / sizeof(unsigned short) + fIndYmin);
assert( (signed) fHitYlst <= row.NHits() );
assert( (signed) fHitYfst1 <= row.NHits() );
assert( (signed) fHitYlst1 <= row.NHits() );
- if ( drawSearch ) {
-#ifdef DRAW
- std::cout << " Grid, row " << iRow << ": nHits=" << row.NHits() << ", grid n=" << row.Grid().N() << ", c[n]=" << sGridP[row.Grid().N()] << std::endl;
- std::cout << "hit steps = " << stepY << " " << stepZ << std::endl;
- std::cout << " Grid bins:" << std::endl;
- for ( unsigned int i = 0; i < row.Grid().N(); i++ ) {
- std::cout << " bin " << i << ": ";
- for ( int j = sGridP[i]; j < sGridP[i+1]; j++ ) {
- ushort2 hh = hits[j];
- float y = y0 + hh.x * stepY;
- float z = z0 + hh.y * stepZ;
- std::cout << "[" << j << "|" << y << "," << z << "] ";
- }
- std::cout << std::endl;
- }
-#endif //DRAW
- }
-#ifdef DRAW
- if ( sGridP[row.Grid().N()] != row.NHits() ) {
- std::cout << " grid, row " << iRow << ": nHits=" << row.NHits() << ", grid n=" << row.Grid().N() << ", c[n]=" << sGridP[row.Grid().N()] << std::endl;
- //exit(0);
- }
-#endif //DRAW
- }
-#ifdef DRAW
- if ( drawSearch ) {
- std::cout << " tracklet " << r.fItr << ", row " << iRow << ", yz= " << fY << "," << fZ << ": search hits=" << fHitYfst << " " << fHitYlst << " / " << fHitYfst1 << " " << fHitYlst1 << std::endl;
- std::cout << " hit search :" << std::endl;
}
-#endif //DRAW
- for ( unsigned int fIh = fHitYfst; fIh < fHitYlst; fIh++ ) {
+
+ for ( unsigned int fIh = fHitYfst; fIh < fHitYlst; fIh++ ) {
assert( (signed) fIh < row.NHits() );
ushort2 hh;
#if defined(HLTCA_GPU_TEXTURE_FETCH)
int ddy = ( int )( hh.x ) - fY0;
int ddz = ( int )( hh.y ) - fZ0;
int dds = CAMath::Abs( ddy ) + CAMath::Abs( ddz );
- if ( drawSearch ) {
- #ifdef DRAW
- std::cout << fIh << ": hityz= " << hh.x << " " << hh.y << "(" << hh.x*stepY << " " << hh.y*stepZ << "), trackyz=" << fY0 << " " << fZ0 << "(" << fY0*stepY << " " << fZ0*stepZ << "), dy,dz,ds= " << ddy << " " << ddz << " " << dds << "(" << ddy*stepY << " " << ddz*stepZ << std::endl;
- #endif //DRAW
- }
if ( dds < ds ) {
ds = dds;
best = fIh;
int ddy = ( int )( hh.x ) - fY0;
int ddz = ( int )( hh.y ) - fZ0;
int dds = CAMath::Abs( ddy ) + CAMath::Abs( ddz );
- if ( drawSearch ) {
- #ifdef DRAW
- std::cout << fIh << ": hityz= " << hh.x << " " << hh.y << "(" << hh.x*stepY << " " << hh.y*stepZ << "), trackyz=" << fY0 << " " << fZ0 << "(" << fY0*stepY << " " << fZ0*stepZ << "), dy,dz,ds= " << ddy << " " << ddz << " " << dds << "(" << ddy*stepY << " " << ddz*stepZ << std::endl;
- #endif //DRAW
- }
if ( dds < ds ) {
ds = dds;
best = fIh;
#endif //!EXTERN_ROW_HITS
break;
}
- if ( drawSearch ) {
- #ifdef DRAW
- std::cout << "hit search " << r.fItr << ", row " << iRow << " hit " << best << " found" << std::endl;
- AliHLTTPCCADisplay::Instance().DrawSliceHit( iRow, best, kRed, 1. );
- AliHLTTPCCADisplay::Instance().Ask();
- AliHLTTPCCADisplay::Instance().DrawSliceHit( iRow, best, kWhite, 1 );
- AliHLTTPCCADisplay::Instance().DrawSliceHit( iRow, best );
- #endif //DRAW
- }
ushort2 hh;
#if defined(HLTCA_GPU_TEXTURE_FETCH)
hh = hits[best];
#endif //HLTCA_GPU_TEXTURE_FETCH
- //std::cout<<"mark 3, "<<r.fItr<<std::endl;
- //tParam.Print();
tracker.GetErrors2( iRow, *( ( AliHLTTPCCATrackParam* )&tParam ), err2Y, err2Z );
- //std::cout<<"mark 4"<<std::endl;
float y = y0 + hh.x * stepY;
float z = z0 + hh.y * stepZ;
if ( sy2 > 2. ) sy2 = 2.;
if ( sz2 > 2. ) sz2 = 2.;
- if ( drawSearch ) {
- #ifdef DRAW
- std::cout << "dy,sy= " << dy << " " << CAMath::Sqrt( sy2 ) << ", dz,sz= " << dz << " " << CAMath::Sqrt( sz2 ) << std::endl;
- std::cout << "dy,dz= " << dy << " " << dz << ", sy,sz= " << CAMath::Sqrt( sy2 ) << " " << CAMath::Sqrt( sz2 ) << ", sy,sz= " << CAMath::Sqrt( kFactor*( tParam.GetErr2Y() + err2Y ) ) << " " << CAMath::Sqrt( kFactor*( tParam.GetErr2Z() + err2Z ) ) << std::endl;
- #endif //DRAW
- }
if ( CAMath::FMulRZ( dy, dy ) > sy2 || CAMath::FMulRZ( dz, dz ) > sz2 ) {
- if ( drawSearch ) {
- #ifdef DRAW
- std::cout << "found hit is out of the chi2 window\n " << std::endl;
- #endif //DRAW
- }
#ifndef EXTERN_ROW_HITS
tracklet.SetRowHit(iRow, -1);
#else
#endif //!EXTERN_ROW_HITS
break;
}
-#ifdef DRAW
- //if( SAVE() ) hitstore[ iRow ] = best;
- //std::cout<<"hit search before filter: "<<r.fItr<<", row "<<iRow<<std::endl;
- //AliHLTTPCCADisplay::Instance().DrawTracklet(tParam, hitstore, kBlue);
- //AliHLTTPCCADisplay::Instance().Ask();
-#endif //DRAW
if ( !tParam.Filter( y, z, err2Y, err2Z, .99 ) ) {
- if ( drawSearch ) {
- #ifdef DRAW
- std::cout << "tracklet " << r.fItr << " at row " << iRow << " : can not filter!!!! " << std::endl;
- #endif //DRAW
- }
break;
}
#ifndef EXTERN_ROW_HITS
#else
tracker.TrackletRowHits()[iRow * s.fNTracklets + r.fItr] = best;
#endif //!EXTERN_ROW_HITS
- if ( drawSearch ) {
- #ifdef DRAW
- std::cout << "tracklet " << r.fItr << " after filter at row " << iRow << " : " << std::endl;
- tParam.Print();
- AliHLTTPCCADisplay::Instance().DrawTracklet( tParam, hitstore, kRed );
- AliHLTTPCCADisplay::Instance().Ask();
- #endif //DRAW
- }
r.fNHits++;
r.fNMissed = 0;
if ( r.fStage == 1 ) r.fLastRow = iRow;
//Fetch a new trackled to be processed by this thread
__syncthreads();
int nextTracketlFirstRun = sMem.fNextTrackletFirstRun;
- if (threadIdx.x == TRACKLET_CONSTRUCTOR_NMEMTHREDS)
+ if (threadIdx.x == 0)
{
sMem.fNTracklets = *tracker.NTracklets();
if (sMem.fNextTrackletFirstRun)
else
{
sMem.fNextTrackletFirst = nTracklet.x;
- sMem.fNextTrackletNoDummy = 1;
}
}
#endif //HLTCA_GPU_SCHED_FIXED_START
}
else
{
- const int nFetchTracks = CAMath::Max(CAMath::Min((*tracker.RowBlockPos(Reverse, RowBlock)).x - (*tracker.RowBlockPos(Reverse, RowBlock)).y, HLTCA_GPU_THREAD_COUNT - TRACKLET_CONSTRUCTOR_NMEMTHREDS), 0);
+ const int nFetchTracks = CAMath::Max(CAMath::Min((*tracker.RowBlockPos(Reverse, RowBlock)).x - (*tracker.RowBlockPos(Reverse, RowBlock)).y, HLTCA_GPU_THREAD_COUNT), 0);
sMem.fNextTrackletCount = nFetchTracks;
const int nUseTrack = nFetchTracks ? CAMath::AtomicAdd(&(*tracker.RowBlockPos(Reverse, RowBlock)).y, nFetchTracks) : 0;
sMem.fNextTrackletFirst = nUseTrack;
tracker.RowBlockTracklets(Reverse, RowBlock)[(nStartFillTrack + i) % HLTCA_GPU_MAX_TRACKLETS] = -3; //Dummy filling track
}
}
- sMem.fNextTrackletNoDummy = 0;
}
}
__syncthreads();
{
return(-2); //No more track in this RowBlock
}
-#if HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS > 0
- else if (threadIdx.x < TRACKLET_CONSTRUCTOR_NMEMTHREDS)
- {
- return(-1);
- }
-#endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS > 0
- else if (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS >= sMem.fNextTrackletCount)
+ else if (threadIdx.x >= sMem.fNextTrackletCount)
{
return(-1); //No track in this RowBlock for this thread
}
else if (nextTracketlFirstRun)
{
- if (threadIdx.x == TRACKLET_CONSTRUCTOR_NMEMTHREDS) sMem.fNextTrackletFirstRun = 0;
+ if (threadIdx.x == 0) sMem.fNextTrackletFirstRun = 0;
mustInit = 1;
- return(sMem.fNextTrackletFirst + threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS);
+ return(sMem.fNextTrackletFirst + threadIdx.x);
}
else
{
- const int nTrackPos = sMem.fNextTrackletFirst + threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS;
+ const int nTrackPos = sMem.fNextTrackletFirst + threadIdx.x;
mustInit = (nTrackPos < tracker.RowBlockPos(Reverse, RowBlock)->w);
volatile int* const ptrTracklet = &tracker.RowBlockTracklets(Reverse, RowBlock)[nTrackPos % HLTCA_GPU_MAX_TRACKLETS];
int nTracklet;
GPUshared() AliHLTTPCCASharedMemory sMem;
#ifdef HLTCA_GPU_SCHED_FIXED_START
- if (threadIdx.x == TRACKLET_CONSTRUCTOR_NMEMTHREDS)
+ if (threadIdx.x == 0)
{
sMem.fNextTrackletFirstRun = 1;
}
#endif //HLTCA_GPU_SCHED_FIXED_START
#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
- if (threadIdx.x == TRACKLET_CONSTRUCTOR_NMEMTHREDS)
+ if (threadIdx.x == 0)
{
sMem.fMaxSync = 0;
}
__syncthreads();
threadSync = CAMath::Min(sMem.fMaxSync, 100000000 / blockDim.x / gridDim.x);
#endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
-#ifndef HLTCA_GPU_PREFETCHDATA
if (!sharedRowsInitialized)
{
for (int i = threadIdx.x;i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x)
}
sharedRowsInitialized = 1;
}
-#endif //!HLTCA_GPU_PREFETCHDATA
#ifdef HLTCA_GPU_RESCHED
short2 storeToRowBlock;
int storePosition = 0;
- if (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1))
+ if (threadIdx.x < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1))
{
- const int nReverse = (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS) / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
- const int nRowBlock = (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS) % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+ const int nReverse = threadIdx.x / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+ const int nRowBlock = threadIdx.x % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
sMem.fTrackletStoreCount[nReverse][nRowBlock] = 0;
}
#endif //HLTCA_GPU_RESCHED
{
CopyTrackletTempData( rMemGlobal, rMem, tParamGlobal, tParam );
}
-#ifdef HLTCA_GPU_PREFETCHDATA
- else if (threadIdx.x < TRACKLET_CONSTRUCTOR_NMEMTHREDS)
- {
- ReadData(threadIdx.x, sMem, rMem, tracker, iReverse ? (HLTCA_ROW_COUNT - 1 - iRowBlock * HLTCA_GPU_SCHED_ROW_STEP) : (CAMath::Max(1, iRowBlock * HLTCA_GPU_SCHED_ROW_STEP)));
- }
-#endif //HLTCA_GPU_PREFETCHDATA
rMem.fItr = iTracklet;
rMem.fGo = (iTracklet >= 0);
#ifdef HLTCA_GPU_RESCHED
storeToRowBlock.x = iRowBlock + 1;
storeToRowBlock.y = iReverse;
-#ifdef HLTCA_GPU_PREFETCHDATA
- rMem.fCurrentData ^= 1;
- __syncthreads();
-#endif //HLTCA_GPU_PREFETCHDATA
if (iReverse)
{
for (int j = HLTCA_ROW_COUNT - 1 - iRowBlock * HLTCA_GPU_SCHED_ROW_STEP;j >= CAMath::Max(0, HLTCA_ROW_COUNT - (iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP);j--)
if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && !(j >= rMem.fEndRow || ( j >= rMem.fStartRow && j - rMem.fStartRow % 2 == 0)))
pTracker[0].StageAtSync()[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1;
#endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
-#ifdef HLTCA_GPU_PREFETCHDATA
- if (threadIdx.x < TRACKLET_CONSTRUCTOR_NMEMTHREDS && j > CAMath::Max(0, HLTCA_ROW_COUNT - (iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP))
- {
- ReadData(threadIdx.x, sMem, rMem, tracker, j - 1);
- }
- else
-#endif //HLTCA_GPU_PREFETCHDATA
if (iTracklet >= 0)
{
UpdateTracklet(gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j);
if (rMem.fNMissed > kMaxRowGap && j <= rMem.fStartRow)
{
rMem.fGo = 0;
-#ifndef HLTCA_GPU_PREFETCHDATA
break;
-#endif //!HLTCA_GPU_PREFETCHDATA
}
}
-#ifdef HLTCA_GPU_PREFETCHDATA
- __syncthreads();
- rMem.fCurrentData ^= 1;
-#endif //HLTCA_GPU_PREFETCHDATA
}
if (iTracklet >= 0 && (!rMem.fGo || iRowBlock == HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP))
if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && j >= rMem.fStartRow && (rMem.fStage > 0 || rMem.fCurrIH >= 0 || (j - rMem.fStartRow) % 2 == 0 ))
pTracker[0].StageAtSync()[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1;
#endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
-#ifdef HLTCA_GPU_PREFETCHDATA
- if (threadIdx.x < TRACKLET_CONSTRUCTOR_NMEMTHREDS && j < CAMath::Min((iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP, HLTCA_ROW_COUNT) - 1)
- {
- ReadData(threadIdx.x, sMem, rMem, tracker, j + 1);
- }
- else
-#endif //HLTCA_GPU_PREFETCHDATA
if (iTracklet >= 0)
{
UpdateTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j);
-#ifndef HLTCA_GPU_PREFETCHDATA
//if (rMem.fNMissed > kMaxRowGap || rMem.fGo == 0) break; //DR!!! CUDA Crashes with this enabled
-#endif //!HLTCA_GPU_PREFETCHDATA
}
-#ifdef HLTCA_GPU_PREFETCHDATA
- __syncthreads();
- rMem.fCurrentData ^= 1;
-#endif //HLTCA_GPU_PREFETCHDATA
}
if (rMem.fGo && (rMem.fNMissed > kMaxRowGap || iRowBlock == HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP))
{
-#if defined(HLTCA_GPU_PREFETCHDATA)
- if ( !tParam.TransportToX( tracker.Row( rMem.fEndRow ).X(), tracker.Param().ConstBz(), .999 ) )
-#else
if ( !tParam.TransportToX( sMem.fRows[ rMem.fEndRow ].X(), tracker.Param().ConstBz(), .999 ) )
-#endif //HLTCA_GPU_PREFETCHDATA
{
rMem.fGo = 0;
}
#ifdef HLTCA_GPU_RESCHED
__syncthreads();
- if (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1))
+ if (threadIdx.x < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1))
{
- const int nReverse = (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS) / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
- const int nRowBlock = (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS) % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+ const int nReverse = threadIdx.x / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+ const int nRowBlock = threadIdx.x % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
if (sMem.fTrackletStoreCount[nReverse][nRowBlock])
{
sMem.fTrackletStoreCount[nReverse][nRowBlock] = CAMath::AtomicAdd(&tracker.RowBlockPos(nReverse, nRowBlock)->x, sMem.fTrackletStoreCount[nReverse][nRowBlock]);