Update of the GPU tracker from David Rohr
authorsgorbuno <sgorbuno@f7af4fe6-9843-0410-8265-dc069ae4e863>
Thu, 1 Oct 2009 13:35:10 +0000 (13:35 +0000)
committersgorbuno <sgorbuno@f7af4fe6-9843-0410-8265-dc069ae4e863>
Thu, 1 Oct 2009 13:35:10 +0000 (13:35 +0000)
49 files changed:
HLT/TPCLib/tracking-ca/AliHLTTPCCAClusterData.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCAClusterData.h
HLT/TPCLib/tracking-ca/AliHLTTPCCADef.h
HLT/TPCLib/tracking-ca/AliHLTTPCCADisplay.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUConfig.h [new file with mode: 0644]
HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cu
HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cu.patch [new file with mode: 0644]
HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cxx [new file with mode: 0644]
HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.h
HLT/TPCLib/tracking-ca/AliHLTTPCCAGrid.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCAGrid.h
HLT/TPCLib/tracking-ca/AliHLTTPCCAHit.h
HLT/TPCLib/tracking-ca/AliHLTTPCCAHitArea.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCAHitArea.h
HLT/TPCLib/tracking-ca/AliHLTTPCCAHitId.h
HLT/TPCLib/tracking-ca/AliHLTTPCCAMerger.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCANeighboursCleaner.h
HLT/TPCLib/tracking-ca/AliHLTTPCCANeighboursFinder.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCANeighboursFinder.h
HLT/TPCLib/tracking-ca/AliHLTTPCCAParam.h
HLT/TPCLib/tracking-ca/AliHLTTPCCAPerformance.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCAProcess.h
HLT/TPCLib/tracking-ca/AliHLTTPCCARow.h
HLT/TPCLib/tracking-ca/AliHLTTPCCASliceData.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCASliceData.h
HLT/TPCLib/tracking-ca/AliHLTTPCCASliceOutput.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCASliceOutput.h
HLT/TPCLib/tracking-ca/AliHLTTPCCAStandaloneFramework.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCAStandaloneFramework.h
HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsFinder.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsFinder.h
HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsSorter.cxx [new file with mode: 0644]
HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsSorter.h [new file with mode: 0644]
HLT/TPCLib/tracking-ca/AliHLTTPCCATrack.h
HLT/TPCLib/tracking-ca/AliHLTTPCCATrackLinearisation.h
HLT/TPCLib/tracking-ca/AliHLTTPCCATrackParam.h
HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCATracker.h
HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerComponent.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerComponent.h
HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.cxx [new file with mode: 0644]
HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.h [new file with mode: 0644]
HLT/TPCLib/tracking-ca/AliHLTTPCCATracklet.h
HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletConstructor.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletConstructor.h
HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletSelector.cxx
HLT/TPCLib/tracking-ca/AliHLTTPCCATrackletSelector.h
HLT/TPCLib/tracking-ca/AliTPCtrackerCA.cxx
HLT/TPCLib/tracking-ca/MemoryAssignmentHelpers.h

index cf07955..321eccd 100644 (file)
@@ -18,6 +18,7 @@
 #include "AliHLTTPCCAMath.h"
 #include <algorithm>
 #include "AliHLTArray.h"
+#include "AliHLTTPCCAGPUConfig.h"
 
 void AliHLTTPCCAClusterData::StartReading( int sliceIndex, int guessForNumberOfClusters )
 {
@@ -27,8 +28,8 @@ void AliHLTTPCCAClusterData::StartReading( int sliceIndex, int guessForNumberOfC
   fFirstRow = 0;
   fLastRow = 0;
   fData.clear();
-  fNumberOfClusters.reserve( 160 );
-  fRowOffset.reserve( 160 );
+  fNumberOfClusters.reserve( HLTCA_ROW_COUNT + 1 );
+  fRowOffset.reserve( HLTCA_ROW_COUNT + 1 );
   fData.reserve( CAMath::Max( 64, guessForNumberOfClusters ) );
 }
 
index 758b12f..df9c1f6 100644 (file)
@@ -17,6 +17,7 @@
 #ifndef ALIHLTTPCCACLUSTERDATA_H
 #define ALIHLTTPCCACLUSTERDATA_H
 
+#include "AliHLTTPCCADef.h"
 #include <iostream>
 #include <vector>
 
@@ -29,7 +30,7 @@ class AliHLTTPCCAClusterData
 {
   public:
 
-    AliHLTTPCCAClusterData(): fSliceIndex( 0 ), fFirstRow( 0 ), fLastRow( 0 ), fNumberOfClusters(), fRowOffset(), fData() {}
+    AliHLTTPCCAClusterData(): fSliceIndex( 0 ), fFirstRow( 0 ), fLastRow( -1 ), fNumberOfClusters(), fRowOffset(), fData() {}
 
     /**
      * prepare for the reading of event
@@ -151,8 +152,11 @@ class AliHLTTPCCAClusterData
      */
     void Merge( int index1, int index2 );
 
-
-    static bool CompareClusters( const Data &a, const Data &b ) { return ( a.fRow < b.fRow ); }
+#ifdef REPRODUCIBLE_CLUSTER_SORTING
+       static bool CompareClusters( const Data &a, const Data &b ) { return ( a.fRow >= b.fRow ? (a.fId < b.fId) : (a.fRow < b.fRow) ); }
+#else
+       static bool CompareClusters( const Data &a, const Data &b ) { return ( (a.fRow < b.fRow) ); }
+#endif
 
     int fSliceIndex;  // the slice index this data belongs to
     int fFirstRow; // see FirstRow()
index 88fefc6..9903e79 100644 (file)
 #endif
 #endif
 
-#if defined(HLTCA_STANDALONE) || defined(HLTCA_GPUCODE)
+#if defined(HLTCA_STANDALONE)
 
 // class TObject{};
 
+#ifdef ClassDef
+#undef ClassDef
+#endif
+
+#ifdef ClassTmp
+#undef ClassTmp
+#endif
+
 #define ClassDef(name,id)
 #define ClassImp(name)
 
@@ -96,16 +104,20 @@ namespace AliHLTTPCCADefinitions
 
 #endif
 
+//#define EXTERN_ROW_HITS
+#define TRACKLET_SELECTOR_MIN_HITS 10
+#define REPRODUCIBLE_CLUSTER_SORTING
+
 #ifdef HLTCA_GPUCODE
-#define ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP 5
-#define ALIHLTTPCCANEIGHBOURS_FINDER_MAX_FGRIDCONTENTUPDOWN 700
+#define ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP 6
+#define ALIHLTTPCCANEIGHBOURS_FINDER_MAX_FGRIDCONTENTUPDOWN 1000
 #define ALIHLTTPCCASTARTHITSFINDER_MAX_FROWSTARTHITS 3500
-#define ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM 650
+#define ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM 1536                                  //Max amount of hits in a row that can be stored in shared memory, make sure this is divisible by ROW ALIGNMENT
 #else
 #define ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP 20
 #define ALIHLTTPCCANEIGHBOURS_FINDER_MAX_FGRIDCONTENTUPDOWN 7000
 #define ALIHLTTPCCASTARTHITSFINDER_MAX_FROWSTARTHITS 10000
-#define ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM 5000
+#define ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM 15000
 #endif
 
 #ifdef HLTCA_GPUCODE
@@ -118,8 +130,6 @@ namespace AliHLTTPCCADefinitions
 #define GPUshared() __shared__
 #define GPUsync() __syncthreads()
 
-__constant__ float4 gAliHLTTPCCATracker[30000/sizeof( float4 )];
-
 #else
 
 #define GPUd()
@@ -130,9 +140,15 @@ __constant__ float4 gAliHLTTPCCATracker[30000/sizeof( float4 )];
 #define GPUsync()
 
 struct float2 { float x; float y; };
-struct uchar2 { unsigned char x; unsigned char y; };
-struct ushort2 { unsigned short x; unsigned short y; };
+struct uchar2 { unsigned char x, y; };
+struct short2 { short x, y; };
+struct ushort2 { unsigned short x, y; };
+struct int2 { int x, y; };
+struct int3 { int x, y, z; };
+struct int4 { int x, y, z, w; };
 struct uint1 { unsigned int x; };
+struct uint2 { unsigned int x, y; };
+struct uint3 { unsigned int x, y, z; };
 struct uint4 { unsigned int x, y, z, w; };
 
 #ifdef R__WIN32
@@ -149,6 +165,8 @@ inline bool finite(float x)
 /*
  * Helper for compile-time verification of correct API usage
  */
+
+#ifndef HLTCA_GPUCODE
 namespace
 {
   template<bool> struct HLTTPCCA_STATIC_ASSERT_FAILURE;
@@ -161,6 +179,9 @@ namespace
   typedef HLTTPCCA_STATIC_ASSERT_FAILURE<cond> HLTTPCCA_STATIC_ASSERT_CONCAT(_STATIC_ASSERTION_FAILED_##msg, __LINE__); \
   HLTTPCCA_STATIC_ASSERT_CONCAT(_STATIC_ASSERTION_FAILED_##msg, __LINE__) Error_##msg; \
   (void) Error_##msg
+#else
+#define STATIC_ASSERT(a, b)
+#endif
 
 namespace
 {
@@ -184,4 +205,10 @@ namespace
   void UNUSED_PARAM9( const T1 &, const T2 &, const T3 &, const T4 &, const T5 &, const T6 &, const T7 &, const T8 &, const T9 & ) {}
 }
 
+#define UNROLL2(var, code) code;var++;code;var++;
+#define UNROLL4(var, code) UNROLL2(var, code) UNROLL2(var, code)
+#define UNROLL8(var, code) UNROLL4(var, code) UNROLL4(var, code)
+#define UNROLL16(var, code) UNROLL8(var, code) UNROLL8(var, code)
+#define UNROLL32(var, code) UNROLL16(var, code) UNROLL16(var, code)
+
 #endif
index 1fb1528..4c2fcb7 100644 (file)
@@ -813,7 +813,7 @@ void AliHLTTPCCADisplay::DrawSliceOutTrack( AliHLTTPCCATrackParam &/*t*/, double
   AliHLTTPCCADisplayTmpHit vHits[200];
 
   for ( int ih = 0; ih < track.NHits(); ih++ ) {
-    int id = tracker.FirstSliceHit()[fSlice->Param().ISlice()] + fSlice->OutTrackHits()[track.FirstHitRef()+ih];
+    int id = tracker.FirstSliceHit()[fSlice->Param().ISlice()] + fSlice->OutTrackHit(track.FirstHitRef()+ih);
     const AliHLTTPCCAGBHit &h = tracker.Hits()[id];
     vHits[ih].SetID( id );
     vHits[ih].SetS( 0 );
@@ -835,7 +835,7 @@ void AliHLTTPCCADisplay::DrawSliceOutTrack( int /*itr*/, int /*color*/, int /*wi
   AliHLTTPCCADisplayTmpHit vHits[200];
 
   for ( int ih = 0; ih < track.NHits(); ih++ ) {
-    int id = tracker.FirstSliceHit()[fSlice->Param().ISlice()] + fSlice->OutTrackHits()[track.FirstHitRef()+ih];
+    int id = tracker.FirstSliceHit()[fSlice->Param().ISlice()] + fSlice->OutTrackHit(track.FirstHitRef()+ih);
     const AliHLTTPCCAGBHit &h = tracker.Hits()[id];
     vHits[ih].SetID( id );
     vHits[ih].SetS( 0 );
diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUConfig.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUConfig.h
new file mode 100644 (file)
index 0000000..668b42f
--- /dev/null
@@ -0,0 +1,73 @@
+#ifndef ALIHLTTPCCAGPUCONFIG_H\r
+#define ALIHLTTPCCAGPUCONFIG_H\r
+\r
+#define HLTCA_GPU_BLOCK_COUNT 30\r
+#define HLTCA_GPU_THREAD_COUNT 256\r
+\r
+#define HLTCA_GPU_WARP_SIZE 32\r
+#define HLTCA_GPU_REGS 64\r
+#define HLTCA_ROW_COUNT 159\r
+\r
+#define HLTCA_GPU_ROWALIGNMENT uint4\r
+#define HLTCA_GPU_ROWCOPY int\r
+#define HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS 32
+//#define HLTCA_GPU_PREFETCHDATA
+//#define HLTCA_GPU_PREFETCH_ROWBLOCK_ONLY
+
+#define HLTCA_GPU_SCHED_ROW_STEP 32
+#define HLTCA_GPU_SCHED_FIXED_START
+//#define HLTCA_GPU_SCHED_FIXED_SLICE
+#define HLTCA_GPU_RESCHED
+
+#define HLTCA_GPU_TEXTURE_FETCH
+
+//#define HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+
+#define HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE 12
+#define HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT 3              //Currently must be smaller than avaiable MultiProcessors on GPU or will result in wrong results
+
+#define HLTCA_GPU_SORT_DUMPDATA
+
+#define HLTCA_GPU_MAX_TRACKLETS 12288                                  //Should be divisible by 16 at least
+#define HLTCA_GPU_MAX_TRACKS 3072\r
+
+//#define HLTCA_GPU_EMULATION_SINGLE_TRACKLET 1313
+//#define HLTCA_GPU_EMULATION_DEBUG_TRACKLET 1313
+
+#define HLTCA_GPU_TRACKER_CONSTANT_MEM 65000
+
+#define HLTCA_GPU_TRACKER_OBJECT_MEMORY 1024 * 1024
+#define HLTCA_GPU_ROWS_MEMORY 1024 * 1024
+#define HLTCA_GPU_COMMON_MEMORY 1024 * 1024
+#define HLTCA_GPU_SLICE_DATA_MEMORY 7 * 1024 * 1024
+#define HLTCA_GPU_GLOBAL_MEMORY 20 * 1024 * 1024
+#define HLTCA_GPU_TRACKS_MEMORY 2 * 1024 * 1024
+
+#ifndef HLTCA_GPUCODE
+#ifdef HLTCA_GPU_TEXTURE_FETCH
+#undef HLTCA_GPU_TEXTURE_FETCH
+#endif
+
+#ifdef HLTCA_GPU_PREFETCHDATA
+#undef HLTCA_GPU_PREFETCHDATA
+#endif
+
+#undef HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE
+#define HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE 0
+
+#else
+#define HLTCA_GPU_SORT_STARTHITS
+#endif
+
+#if !defined(HLTCA_GPU_PREFETCHDATA) | !defined(HLTCA_GPU_RESCHED)
+#undef HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS
+#define HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS 0
+#endif
+
+#define HLTCA_GPU_ERROR_NONE 0
+#define HLTCA_GPU_ERROR_ROWBLOCK_TRACKLET_OVERFLOW 1
+#define HLTCA_GPU_ERROR_TRACKLET_OVERFLOW 2\r
+#define HLTCA_GPU_ERROR_TRACK_OVERFLOW 3\r
+#define HLTCA_GPU_ERROR_SCHEDULE_COLLISION 4\r
+
+#endif
\ No newline at end of file
index b881a5e..e5530ce 100644 (file)
@@ -17,8 +17,9 @@
 //                                                                          *
 //***************************************************************************
 
-#include <cutil.h>
-#include <cutil_inline_runtime.h>
+#include "AliHLTTPCCADef.h"
+#include "AliHLTTPCCAGPUConfig.h"
+
 #include <sm_11_atomic_functions.h>
 #include <sm_12_atomic_functions.h>
 
 
 #include "AliHLTTPCCAGPUTracker.h"
 
-#ifdef BUILD_GPU
+__constant__ float4 gAliHLTTPCCATracker[HLTCA_GPU_TRACKER_CONSTANT_MEM / sizeof( float4 )];
+#ifdef HLTCA_GPU_TEXTURE_FETCH
+texture<ushort2, 1, cudaReadModeElementType> gAliTexRefu2;
+texture<unsigned short, 1, cudaReadModeElementType> gAliTexRefu;
+texture<signed short, 1, cudaReadModeElementType> gAliTexRefs;
+#endif
+
+#include "AliHLTTPCCAHit.h"
 
 //Include CXX Files, GPUd() macro will then produce CUDA device code out of the tracker source code
 #include "AliHLTTPCCATrackParam.cxx"
 
 #include "AliHLTTPCCANeighboursCleaner.cxx"
 #include "AliHLTTPCCAStartHitsFinder.cxx"
+#include "AliHLTTPCCAStartHitsSorter.cxx"
 #include "AliHLTTPCCATrackletConstructor.cxx"
 #include "AliHLTTPCCASliceOutput.cxx"
 
+#include "MemoryAssignmentHelpers.h"
+
+#ifndef HLTCA_STANDALONE
+#include "AliHLTDefinitions.h"
+#include "AliHLTSystem.h"
 #endif
 
-AliHLTTPCCAGPUTracker::AliHLTTPCCAGPUTracker() : gpuTracker(), DebugLevel(0) {}
-AliHLTTPCCAGPUTracker::~AliHLTTPCCAGPUTracker() {}
+ClassImp( AliHLTTPCCAGPUTracker )
 
-//Find best CUDA device, initialize and allocate memory
-int AliHLTTPCCAGPUTracker::InitGPU()
+int AliHLTTPCCAGPUTracker::InitGPU(int sliceCount, int forceDeviceID)
 {
-#ifdef BUILD_GPU
-       int cudaDevice = cutGetMaxGflopsDeviceId();
-       cudaSetDevice(cudaDevice);
-
-  cudaDeviceProp prop;
-  cudaGetDeviceProperties(&prop ,0 ); 
-  std::cout<<"CUDA Device Properties: "<<std::endl;
-  std::cout<<"name = "<<prop.name<<std::endl;
-  std::cout<<"totalGlobalMem = "<<prop.totalGlobalMem<<std::endl;
-  std::cout<<"sharedMemPerBlock = "<<prop.sharedMemPerBlock<<std::endl;
-  std::cout<<"regsPerBlock = "<<prop.regsPerBlock<<std::endl;
-  std::cout<<"warpSize = "<<prop.warpSize<<std::endl;
-  std::cout<<"memPitch = "<<prop.memPitch<<std::endl;
-  std::cout<<"maxThreadsPerBlock = "<<prop.maxThreadsPerBlock<<std::endl;
-  std::cout<<"maxThreadsDim = "<<prop.maxThreadsDim[0]<<" "<<prop.maxThreadsDim[1]<<" "<<prop.maxThreadsDim[2]<<std::endl;
-  std::cout<<"maxGridSize = "  <<prop.maxGridSize[0]<<" "<<prop.maxGridSize[1]<<" "<<prop.maxGridSize[2]<<std::endl;
-  std::cout<<"totalConstMem = "<<prop.totalConstMem<<std::endl;
-  std::cout<<"major = "<<prop.major<<std::endl;
-  std::cout<<"minor = "<<prop.minor<<std::endl;
-  std::cout<<"clockRate = "<<prop.clockRate<<std::endl;
-  std::cout<<"textureAlignment = "<<prop.textureAlignment<<std::endl;
-
-  GPUMemSize = (int) prop.totalGlobalMem - 400 * 1024 * 1024;
-  if (CUDA_FAILED_MSG(cudaMalloc(&GPUMemory, (size_t) GPUMemSize)))
+       //Find best CUDA device, initialize and allocate memory
+
+       cudaDeviceProp fCudaDeviceProp;
+
+#ifndef CUDA_DEVICE_EMULATION
+       int count, bestDevice = -1, bestDeviceSpeed = 0;
+       if (CudaFailedMsg(cudaGetDeviceCount(&count)))
+       {
+               HLTError("Error getting CUDA Device Count");
+               return(1);
+       }
+       if (fDebugLevel >= 2) std::cout << "Available CUDA devices: ";
+       for (int i = 0;i < count;i++)
+       {
+               cudaGetDeviceProperties(&fCudaDeviceProp, i);
+               if (fDebugLevel >= 2) std::cout << fCudaDeviceProp.name << " (" << i << ")     ";
+               if (fCudaDeviceProp.major < 9 && !(fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2)) && fCudaDeviceProp.multiProcessorCount * fCudaDeviceProp.clockRate > bestDeviceSpeed)
+               {
+                       bestDevice = i;
+                       bestDeviceSpeed = fCudaDeviceProp.multiProcessorCount * fCudaDeviceProp.clockRate;
+               }
+       }
+       if (fDebugLevel >= 2) std::cout << std::endl;
+
+       if (bestDevice == -1)
+       {
+               HLTWarning("No CUDA Device available, aborting CUDA Initialisation");
+               return(1);
+       }
+
+  int cudaDevice;
+  if (forceDeviceID == -1)
+         cudaDevice = bestDevice;
+  else
+         cudaDevice = forceDeviceID;
+#else
+       int cudaDevice = 0;
+#endif
+
+  cudaGetDeviceProperties(&fCudaDeviceProp ,cudaDevice ); 
+
+  if (fDebugLevel >= 1)
   {
-         std::cout << "CUDA Memory Allocation Error\n";
+         std::cout<<"CUDA Device Properties: "<<std::endl;
+         std::cout<<"name = "<<fCudaDeviceProp.name<<std::endl;
+         std::cout<<"totalGlobalMem = "<<fCudaDeviceProp.totalGlobalMem<<std::endl;
+         std::cout<<"sharedMemPerBlock = "<<fCudaDeviceProp.sharedMemPerBlock<<std::endl;
+         std::cout<<"regsPerBlock = "<<fCudaDeviceProp.regsPerBlock<<std::endl;
+         std::cout<<"warpSize = "<<fCudaDeviceProp.warpSize<<std::endl;
+         std::cout<<"memPitch = "<<fCudaDeviceProp.memPitch<<std::endl;
+         std::cout<<"maxThreadsPerBlock = "<<fCudaDeviceProp.maxThreadsPerBlock<<std::endl;
+         std::cout<<"maxThreadsDim = "<<fCudaDeviceProp.maxThreadsDim[0]<<" "<<fCudaDeviceProp.maxThreadsDim[1]<<" "<<fCudaDeviceProp.maxThreadsDim[2]<<std::endl;
+         std::cout<<"maxGridSize = "  <<fCudaDeviceProp.maxGridSize[0]<<" "<<fCudaDeviceProp.maxGridSize[1]<<" "<<fCudaDeviceProp.maxGridSize[2]<<std::endl;
+         std::cout<<"totalConstMem = "<<fCudaDeviceProp.totalConstMem<<std::endl;
+         std::cout<<"major = "<<fCudaDeviceProp.major<<std::endl;
+         std::cout<<"minor = "<<fCudaDeviceProp.minor<<std::endl;
+         std::cout<<"clockRate = "<<fCudaDeviceProp.clockRate<<std::endl;
+         std::cout<<"textureAlignment = "<<fCudaDeviceProp.textureAlignment<<std::endl;
+  }
+
+  if (fCudaDeviceProp.major < 1 || (fCudaDeviceProp.major == 1 && fCudaDeviceProp.minor < 2))
+  {
+       HLTError( "Unsupported CUDA Device" );
          return(1);
   }
-  std::cout << "CUDA Initialisation successfull\n";
-#endif
 
-       return(0);
+  if (CudaFailedMsg(cudaSetDevice(cudaDevice)))
+  {
+         HLTError("Could not set CUDA Device!");
+         return(1);
+  }
+
+  if (fgkNSlices * AliHLTTPCCATracker::CommonMemorySize() > HLTCA_GPU_COMMON_MEMORY)
+  {
+         HLTError("Insufficiant Common Memory");
+         cudaThreadExit();
+         return(1);
+  }
+
+  if (fgkNSlices * (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow) > HLTCA_GPU_ROWS_MEMORY)
+  {
+         HLTError("Insufficiant Row Memory");
+         cudaThreadExit();
+         return(1);
+  }
+
+  fGPUMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_GLOBAL_MEMORY);
+  if (fGPUMemSize > fCudaDeviceProp.totalGlobalMem || CudaFailedMsg(cudaMalloc(&fGPUMemory, (size_t) fGPUMemSize)))
+  {
+         HLTError("CUDA Memory Allocation Error");
+         cudaThreadExit();
+         return(1);
+  }
+  if (fDebugLevel >= 1) HLTInfo("GPU Memory used: %d", (int) fGPUMemSize);
+  int hostMemSize = HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + sliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + HLTCA_GPU_TRACKER_OBJECT_MEMORY;
+  if (CudaFailedMsg(cudaMallocHost(&fHostLockedMemory, hostMemSize)))
+  {
+         cudaFree(fGPUMemory);\r
+         cudaThreadExit();\r
+         HLTError("Error allocating Page Locked Host Memory");
+         return(1);
+  }
+  if (fDebugLevel >= 1) HLTInfo("Host Memory used: %d", hostMemSize);
+
+  if (fDebugLevel >= 1)
+  {
+         CudaFailedMsg(cudaMemset(fGPUMemory, 143, (size_t) fGPUMemSize));
+  }
+  HLTInfo("CUDA Initialisation successfull");
+
+  //Don't run constructor / destructor here, this will be just local memcopy of Tracker in GPU Memory
+  if (sizeof(AliHLTTPCCATracker) * sliceCount > HLTCA_GPU_TRACKER_OBJECT_MEMORY)
+  {
+         HLTError("Insufficiant Tracker Object Memory");
+         return(1);
+  }
+  fSliceCount = sliceCount;
+  fGpuTracker = (AliHLTTPCCATracker*) TrackerMemory(fHostLockedMemory, 0);
+
+  for (int i = 0;i < fgkNSlices;i++)
+  {
+    fSlaveTrackers[i].SetGPUTracker();
+       fSlaveTrackers[i].SetGPUTrackerCommonMemory((char*) CommonMemory(fHostLockedMemory, i));
+       fSlaveTrackers[i].pData()->SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, i), RowMemory(fHostLockedMemory, i));
+  }
+
+  fpCudaStreams = malloc(CAMath::Max(3, fSliceCount) * sizeof(cudaStream_t));
+  cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
+  for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
+  {
+       if (CudaFailedMsg(cudaStreamCreate(&cudaStreams[i])))
+       {
+               HLTError("Error creating CUDA Stream");
+               return(1);
+       }
+  }
+
+#if defined(HLTCA_STANDALONE) & !defined(CUDA_DEVICE_EMULATION)
+  if (fDebugLevel < 2)
+  {
+         //Do one initial run for Benchmark reasons
+         const int useDebugLevel = fDebugLevel;
+         fDebugLevel = 0;
+         AliHLTTPCCAClusterData tmpCluster;
+         AliHLTTPCCASliceOutput tmpOutput;
+         AliHLTTPCCAParam tmpParam;
+         tmpParam.SetNRows(HLTCA_ROW_COUNT);
+         fSlaveTrackers[0].SetParam(tmpParam);
+         Reconstruct(&tmpOutput, &tmpCluster, 0, 1);
+         fDebugLevel = useDebugLevel;
+  }
+#endif
+  return(0);
 }
 
-//Macro to align Pointers.
-//Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker
-//(As long as every single data structure is <= 1 MB)
 template <class T> inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int alignment)
 {
+       //Macro to align Pointers.
+       //Will align to start at 1 MB segments, this should be consistent with every alignment in the tracker
+       //(As long as every single data structure is <= 1 MB)
+
        size_t adr = (size_t) ptr;
        if (adr % alignment)
        {
@@ -112,216 +246,561 @@ template <class T> inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int ali
        return((T*) adr);
 }
 
-//Check for CUDA Error and in the case of an error display the corresponding error string
-bool AliHLTTPCCAGPUTracker::CUDA_FAILED_MSG(cudaError_t error)
+bool AliHLTTPCCAGPUTracker::CudaFailedMsg(cudaError_t error)
 {
+       //Check for CUDA Error and in the case of an error display the corresponding error string
        if (error == cudaSuccess) return(false);
-       printf("CUDA Error: %d / %s\n", error, cudaGetErrorString(error));
+       HLTWarning("CUDA Error: %d / %s", error, cudaGetErrorString(error));
        return(true);
 }
 
-//Wait for CUDA-Kernel to finish and check for CUDA errors afterwards
-int AliHLTTPCCAGPUTracker::CUDASync()
+int AliHLTTPCCAGPUTracker::CUDASync(char* state)
 {
-       if (DebugLevel == 0) return(0);
+       //Wait for CUDA-Kernel to finish and check for CUDA errors afterwards
+
+       if (fDebugLevel == 0) return(0);
        cudaError cuErr;
        cuErr = cudaGetLastError();
        if (cuErr != cudaSuccess)
        {
-               printf("Cuda Error %s while invoking kernel\n", cudaGetErrorString(cuErr));
+               HLTError("Cuda Error %s while invoking kernel (%s)", cudaGetErrorString(cuErr), state);
                return(1);
        }
-       if (CUDA_FAILED_MSG(cudaThreadSynchronize()))
+       if (CudaFailedMsg(cudaThreadSynchronize()))
        {
-               printf("CUDA Error while synchronizing\n");
+               HLTError("CUDA Error while synchronizing (%s)", state);
                return(1);
        }
-       if (DebugLevel >= 4) printf("CUDA Sync Done\n");
+       if (fDebugLevel >= 5) HLTInfo("CUDA Sync Done");
        return(0);
 }
 
-void AliHLTTPCCAGPUTracker::SetDebugLevel(int dwLevel, std::ostream *NewOutFile)
+void AliHLTTPCCAGPUTracker::SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile)
 {
-       DebugLevel = dwLevel;
-       if (NewOutFile) OutFile = NewOutFile;
+       //Set Debug Level and Debug output File if applicable
+       fDebugLevel = dwLevel;
+       if (NewOutFile) fOutFile = NewOutFile;
 }
 
-//Primary reconstruction function
-int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCATracker* tracker)
+int AliHLTTPCCAGPUTracker::SetGPUTrackerOption(char* OptionName, int OptionValue)
 {
-    int nThreads;
-    int nBlocks;
-       int size;
-
-       if (tracker->CheckEmptySlice())
+       //Set a specific GPU Tracker Option
        {
-               if (DebugLevel >= 4) printf("Slice Empty, not running GPU Tracker\n");
-               return(0);
+               HLTError("Unknown Option: %s", OptionName);
+               return(1);
        }
+       //No Options used at the moment
+       //return(0);
+}
 
-       if (DebugLevel >= 3)
+#ifdef HLTCA_STANDALONE
+void AliHLTTPCCAGPUTracker::StandalonePerfTime(int iSlice, int i)
+{
+  //Run Performance Query for timer i of slice iSlice
+  if (fDebugLevel >= 1)
+  {
+         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime( fSlaveTrackers[iSlice].PerfTimer(i));
+  }
+}
+#else
+void AliHLTTPCCAGPUTracker::StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
+#endif
+
+void AliHLTTPCCAGPUTracker::DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check)
+{
+       //Dump Rowblocks to File
+       if (fDebugLevel >= 4)
        {
-               *OutFile << endl << endl << "Slice: " << tracker->Param().ISlice() << endl;
+               *fOutFile << "RowBlock Tracklets" << std::endl;
+       
+               int4* rowBlockPos = (int4*) malloc(sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2);
+               int* rowBlockTracklets = (int*) malloc(sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2);
+               uint2* blockStartingTracklet = (uint2*) malloc(sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT);
+               CudaFailedMsg(cudaMemcpy(rowBlockPos, fGpuTracker[iSlice].RowBlockPos(), sizeof(int4) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * 2, cudaMemcpyDeviceToHost));
+               CudaFailedMsg(cudaMemcpy(rowBlockTracklets, fGpuTracker[iSlice].RowBlockTracklets(), sizeof(int) * (tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2, cudaMemcpyDeviceToHost));
+               CudaFailedMsg(cudaMemcpy(blockStartingTracklet, fGpuTracker[iSlice].BlockStartingTracklet(), sizeof(uint2) * HLTCA_GPU_BLOCK_COUNT, cudaMemcpyDeviceToHost));
+               CudaFailedMsg(cudaMemcpy(tracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
+
+               int k = tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet;
+               for (int i = 0; i < tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1;i++)
+               {
+                       *fOutFile << "Rowblock: " << i << ", up " << rowBlockPos[i].y << "/" << rowBlockPos[i].x << ", down " << 
+                               rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].y << "/" << rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x << endl << "Phase 1: ";
+                       for (int j = 0;j < rowBlockPos[i].x;j++)
+                       {
+                               //Use Tracker Object to calculate Offset instead of fGpuTracker, since *fNTracklets of fGpuTracker points to GPU Mem!
+                               *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
+                               if (check && rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] != k)
+                               {
+                                       HLTError("Wrong starting Row Block %d, entry %d, is %d, should be %d", i, j, rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j], k);
+                               }
+                               k++;
+                               if (rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(0, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] == -1)
+                               {
+                                       HLTError("Error, -1 Tracklet found");
+                               }
+                       }
+                       *fOutFile << endl << "Phase 2: ";
+                       for (int j = 0;j < rowBlockPos[tracker[iSlice].Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1 + i].x;j++)
+                       {
+                               *fOutFile << rowBlockTracklets[(tracker[iSlice].RowBlockTracklets(1, i) - tracker[iSlice].RowBlockTracklets(0, 0)) + j] << ", ";
+                       }
+                       *fOutFile << endl;
+               }
+
+               if (check)
+               {
+                       *fOutFile << "Starting Threads: (First Dynamic: " << tracker[iSlice].GPUParameters()->fScheduleFirstDynamicTracklet << ")" << std::endl;
+                       for (int i = 0;i < HLTCA_GPU_BLOCK_COUNT;i++)
+                       {
+                               *fOutFile << i << ": " << blockStartingTracklet[i].x << " - " << blockStartingTracklet[i].y << std::endl;
+                       }
+               }
+
+               free(rowBlockPos);
+               free(rowBlockTracklets);
+               free(blockStartingTracklet);
        }
+}
 
-       if (DebugLevel >= 4) printf("\n\nInitialising GPU Tracker\n");
-       memcpy(&gpuTracker, tracker, sizeof(AliHLTTPCCATracker));
-       char* tmpMem = alignPointer((char*) GPUMemory, 1024 * 1024);
-       gpuTracker.SetGPUTracker();
+__global__ void PreInitRowBlocks(int4* const RowBlockPos, int* const RowBlockTracklets, int* const SliceDataHitWeights, int nSliceDataHits)
+{
+       //Initialize GPU RowBlocks and HitWeights
+       int4* const rowBlockTracklets4 = (int4*) RowBlockTracklets;
+       int4* const sliceDataHitWeights4 = (int4*) SliceDataHitWeights;
+       const int stride = blockDim.x * gridDim.x;
+       int4 i0, i1;
+       i0.x = i0.y = i0.z = i0.w = 0;
+       i1.x = i1.y = i1.z = i1.w = -1;
+       for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int4) * 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) / sizeof(int4);i += stride)
+               RowBlockPos[i] = i0;
+       for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < sizeof(int) * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_MAX_TRACKLETS * 2 / sizeof(int4);i += stride)
+               rowBlockTracklets4[i] = i1;
+       for (int i = blockIdx.x * blockDim.x + threadIdx.x;i < nSliceDataHits * sizeof(int) / sizeof(int4);i += stride)
+               sliceDataHitWeights4[i] = i0;
+}
 
-       if (DebugLevel >= 4) printf("Initialising GPU Common Memory\n");
-       tmpMem = gpuTracker.SetGPUTrackerCommonMemory(tmpMem);
-       tmpMem = alignPointer(tmpMem, 1024 * 1024);
+int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCASliceOutput* pOutput, AliHLTTPCCAClusterData* pClusterData, int firstSlice, int sliceCountLocal)
+{
+       //Primary reconstruction function
+       cudaStream_t* const cudaStreams = (cudaStream_t*) fpCudaStreams;
 
-       if (DebugLevel >= 4) printf("Initialising GPU Hits Memory\n");
-       tmpMem = gpuTracker.SetGPUTrackerHitsMemory(tmpMem, tracker->NHitsTotal());
-       tmpMem = alignPointer(tmpMem, 1024 * 1024);
+       if (sliceCountLocal == -1) sliceCountLocal = this->fSliceCount;
 
-       if (DebugLevel >= 4) printf("Initialising GPU Slice Data Memory\n");
-       tmpMem = gpuTracker.fData.SetGPUSliceDataMemory(tmpMem, gpuTracker.fClusterData);
-       tmpMem = alignPointer(tmpMem, 1024 * 1024);
-       if (tmpMem - (char*) GPUMemory > GPUMemSize)
+       if (sliceCountLocal * sizeof(AliHLTTPCCATracker) > HLTCA_GPU_TRACKER_CONSTANT_MEM)
        {
-               printf("Out of CUDA Memory\n");
+               HLTError("Insuffissant constant memory (Required %d, Available %d, Tracker %d, Param %d, SliceData %d)", sliceCountLocal * (int) sizeof(AliHLTTPCCATracker), (int) HLTCA_GPU_TRACKER_CONSTANT_MEM, (int) sizeof(AliHLTTPCCATracker), (int) sizeof(AliHLTTPCCAParam), (int) sizeof(AliHLTTPCCASliceData));
                return(1);
        }
-       
-       CUDA_FAILED_MSG(cudaMemcpy(gpuTracker.fCommonMemory, tracker->fCommonMemory, tracker->fCommonMemorySize, cudaMemcpyHostToDevice));
-       CUDA_FAILED_MSG(cudaMemcpy(gpuTracker.fData.fMemory, tracker->fData.fMemory, tracker->fData.fMemorySize, cudaMemcpyHostToDevice));
-       CUDA_FAILED_MSG(cudaMemcpyToSymbol(gAliHLTTPCCATracker, &gpuTracker, sizeof(AliHLTTPCCATracker)));
-
-       if (DebugLevel >= 4) printf("Running GPU Neighbours Finder\n");
-       AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<gpuTracker.Param().NRows(), 256>>>();
-       if (CUDASync()) return 1;
 
-       if (DebugLevel >= 3)
+       if (fDebugLevel >= 4)
        {
-               *OutFile << "Neighbours Finder:" << endl;
-               CUDA_FAILED_MSG(cudaMemcpy(tracker->fData.fMemory, gpuTracker.fData.fMemory, tracker->fData.fMemorySize, cudaMemcpyDeviceToHost));
-               tracker->DumpLinks(*OutFile);
-    }
+               for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+               {
+                       *fOutFile << endl << endl << "Slice: " << fSlaveTrackers[firstSlice + iSlice].Param().ISlice() << endl;
+               }
+       }
 
-       if (DebugLevel >= 4) printf("Running GPU Neighbours Cleaner\n");
-       AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<gpuTracker.Param().NRows()-2, 256>>>();
-       if (CUDASync()) return 1;
+       memcpy(fGpuTracker, &fSlaveTrackers[firstSlice], sizeof(AliHLTTPCCATracker) * sliceCountLocal);\r
 
-       if (DebugLevel >= 3)
+       if (fDebugLevel >= 2) HLTInfo("Running GPU Tracker (Slices %d to %d)", fSlaveTrackers[firstSlice].Param().ISlice(), fSlaveTrackers[firstSlice + sliceCountLocal].Param().ISlice());
+       if (fDebugLevel >= 5) HLTInfo("Allocating GPU Tracker memory and initializing constants");
+       
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
        {
-               *OutFile << "Neighbours Cleaner:" << endl;
-               CUDA_FAILED_MSG(cudaMemcpy(tracker->fData.fMemory, gpuTracker.fData.fMemory, tracker->fData.fMemorySize, cudaMemcpyDeviceToHost));
-               tracker->DumpLinks(*OutFile);
-    }
+               //Make this a GPU Tracker
+               fGpuTracker[iSlice].SetGPUTracker();
+               fGpuTracker[iSlice].SetGPUTrackerCommonMemory((char*) CommonMemory(fGPUMemory, iSlice));
+               fGpuTracker[iSlice].pData()->SetGPUSliceDataMemory(SliceDataMemory(fGPUMemory, iSlice), RowMemory(fGPUMemory, iSlice));
+               fGpuTracker[iSlice].pData()->SetPointers(&pClusterData[iSlice], false);
+
+               //Set Pointers to GPU Memory
+               char* tmpMem = (char*) GlobalMemory(fGPUMemory, iSlice);
+
+               if (fDebugLevel >= 5) HLTInfo("Initialising GPU Hits Memory");
+               tmpMem = fGpuTracker[iSlice].SetGPUTrackerHitsMemory(tmpMem, pClusterData[iSlice].NumberOfClusters());
+               tmpMem = alignPointer(tmpMem, 1024 * 1024);
+
+               if (fDebugLevel >= 5) HLTInfo("Initialising GPU Tracklet Memory");
+               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTrackletsMemory(tmpMem, HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/);
+               tmpMem = alignPointer(tmpMem, 1024 * 1024);
+
+               if (fDebugLevel >= 5) HLTInfo("Initialising GPU Track Memory");
+               tmpMem = fGpuTracker[iSlice].SetGPUTrackerTracksMemory(tmpMem, HLTCA_GPU_MAX_TRACKS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets()*/, pClusterData[iSlice].NumberOfClusters());
+               tmpMem = alignPointer(tmpMem, 1024 * 1024);
+
+               if (fGpuTracker[iSlice].TrackMemorySize() >= HLTCA_GPU_TRACKS_MEMORY)
+               {
+                       HLTError("Insufficiant Track Memory");
+                       return(1);
+               }
+
+               if (tmpMem - (char*) GlobalMemory(fGPUMemory, iSlice) > HLTCA_GPU_GLOBAL_MEMORY)
+               {
+                       HLTError("Insufficiant Global Memory");
+                       return(1);
+               }
+
+               //Initialize Startup Constants
+               *fSlaveTrackers[firstSlice + iSlice].NTracklets() = 0;
+               *fSlaveTrackers[firstSlice + iSlice].NTracks() = 0;
+               *fSlaveTrackers[firstSlice + iSlice].NTrackHits() = 0;
+               fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount = HLTCA_GPU_BLOCK_COUNT * (iSlice + 1) / sliceCountLocal - HLTCA_GPU_BLOCK_COUNT * (iSlice) / sliceCountLocal;
+               if (fDebugLevel >= 5) HLTInfo("Blocks for Slice %d: %d", iSlice, fGpuTracker[iSlice].GPUParametersConst()->fGPUFixedBlockCount);
+               fGpuTracker[iSlice].GPUParametersConst()->fGPUiSlice = iSlice;
+               fGpuTracker[iSlice].GPUParametersConst()->fGPUnSlices = sliceCountLocal;
+               fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError = 0;
+               fGpuTracker[iSlice].pData()->SetGPUTextureBase(fGpuTracker[0].Data().Memory());
+       }
 
-       if (DebugLevel >= 4) printf("Running GPU Start Hits Finder\n");
-       AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<gpuTracker.Param().NRows()-4, 256>>>();
-       if (CUDASync()) return 1;
+#ifdef HLTCA_GPU_TEXTURE_FETCH
+               cudaChannelFormatDesc channelDescu2 = cudaCreateChannelDesc<ushort2>();\r
+               size_t offset;\r
+               if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu2, fGpuTracker[0].Data().Memory(), &channelDescu2, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
+               {
+                       HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
+                       return(1);
+               }
+               cudaChannelFormatDesc channelDescu = cudaCreateChannelDesc<unsigned short>();
+               if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefu, fGpuTracker[0].Data().Memory(), &channelDescu, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
+               {
+                       HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
+                       return(1);
+               }
+               cudaChannelFormatDesc channelDescs = cudaCreateChannelDesc<signed short>();
+               if (CudaFailedMsg(cudaBindTexture(&offset, &gAliTexRefs, fGpuTracker[0].Data().Memory(), &channelDescs, sliceCountLocal * HLTCA_GPU_SLICE_DATA_MEMORY)) || offset)\r
+               {
+                       HLTError("Error binding CUDA Texture (Offset %d)", (int) offset);
+                       return(1);
+               }
+#endif
 
-       if (DebugLevel >= 4) printf("Obtaining Number of Start Hits from GPU: ");
-       CUDA_FAILED_MSG(cudaMemcpy(tracker->fCommonMemory, gpuTracker.fCommonMemory, tracker->fCommonMemorySize, cudaMemcpyDeviceToHost));
-       if (DebugLevel >= 4) printf("%d\n", *tracker->NTracklets());
-       else if (DebugLevel >= 2) printf("%3d ", *tracker->NTracklets());
+       //Copy Tracker Object to GPU Memory
+       if (fDebugLevel >= 5) HLTInfo("Copying Tracker objects to GPU");
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+       if (CudaFailedMsg(cudaMalloc(&fGpuTracker[0].fStageAtSync, 100000000))) return(1);
+       CudaFailedMsg(cudaMemset(fGpuTracker[0].fStageAtSync, 0, 100000000));
+#endif
+       CudaFailedMsg(cudaMemcpyToSymbolAsync(gAliHLTTPCCATracker, fGpuTracker, sizeof(AliHLTTPCCATracker) * sliceCountLocal, 0, cudaMemcpyHostToDevice, cudaStreams[0]));
 
-       if (DebugLevel >= 3)
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
        {
-               *OutFile << "Start Hits: (" << *tracker->NTracklets() << ")" << endl;
-               CUDA_FAILED_MSG(cudaMemcpy(tracker->fHitMemory, gpuTracker.fHitMemory, tracker->fHitMemorySize, cudaMemcpyDeviceToHost));
-               tracker->DumpStartHits(*OutFile);
-    }
-
-       /*tracker->RunNeighboursFinder();
-       tracker->RunNeighboursCleaner();
-       tracker->RunStartHitsFinder();*/
-
-       if (DebugLevel >= 4) printf("Initialising GPU Track Memory\n");
-       tmpMem = gpuTracker.SetGPUTrackerTracksMemory(tmpMem, *tracker->NTracklets(), tracker->NHitsTotal());
-       tmpMem = alignPointer(tmpMem, 1024 * 1024);
-       if (tmpMem - (char*) GPUMemory > GPUMemSize)
+               StandalonePerfTime(firstSlice + iSlice, 0);
+
+               //Initialize GPU Slave Tracker
+               if (fDebugLevel >= 5) HLTInfo("Creating Slice Data");
+               fSlaveTrackers[firstSlice + iSlice].pData()->SetGPUSliceDataMemory(SliceDataMemory(fHostLockedMemory, iSlice), RowMemory(fHostLockedMemory, firstSlice + iSlice));
+               fSlaveTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
+               if (fSlaveTrackers[firstSlice + iSlice].Data().MemorySize() > HLTCA_GPU_SLICE_DATA_MEMORY)
+               {
+                       HLTError("Insufficiant Slice Data Memory");
+                       return(1);
+               }
+
+               /*if (fSlaveTrackers[firstSlice + iSlice].CheckEmptySlice())
+               {
+                       if (fDebugLevel >= 5) HLTInfo("Slice Empty, not running GPU Tracker");
+                       if (sliceCountLocal == 1)
+                               return(0);
+               }*/
+
+               //Initialize temporary memory where needed
+               if (fDebugLevel >= 5) HLTInfo("Copying Slice Data to GPU and initializing temporary memory");           
+               PreInitRowBlocks<<<30, 256, 0, cudaStreams[2]>>>(fGpuTracker[iSlice].RowBlockPos(), fGpuTracker[iSlice].RowBlockTracklets(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign());
+
+               //Copy Data to GPU Global Memory
+               CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fSlaveTrackers[firstSlice + iSlice].CommonMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
+               CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
+               CudaFailedMsg(cudaMemcpyAsync(fGpuTracker[iSlice].SliceDataRows(), fSlaveTrackers[firstSlice + iSlice].SliceDataRows(), (HLTCA_ROW_COUNT + 1) * sizeof(AliHLTTPCCARow), cudaMemcpyHostToDevice, cudaStreams[iSlice & 1]));
+
+               if (fDebugLevel >= 4)
+               {
+                       if (fDebugLevel >= 5) HLTInfo("Allocating Debug Output Memory");
+                       fSlaveTrackers[firstSlice + iSlice].TrackletMemory() = reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].TrackletMemorySize()/sizeof( uint4 ) + 100] );
+                       fSlaveTrackers[firstSlice + iSlice].SetPointersTracklets( HLTCA_GPU_MAX_TRACKLETS );
+                       fSlaveTrackers[firstSlice + iSlice].HitMemory() = reinterpret_cast<char*> ( new uint4 [ fGpuTracker[iSlice].HitMemorySize()/sizeof( uint4 ) + 100] );
+                       fSlaveTrackers[firstSlice + iSlice].SetPointersHits( pClusterData[iSlice].NumberOfClusters() );
+               }
+               
+               if (CUDASync("Initialization")) return(1);
+               StandalonePerfTime(firstSlice + iSlice, 1);
+
+               if (fDebugLevel >= 5) HLTInfo("Running GPU Neighbours Finder");
+               AliHLTTPCCAProcess<AliHLTTPCCANeighboursFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows(), 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
+
+               if (CUDASync("Neighbours finder")) return 1;
+
+               StandalonePerfTime(firstSlice + iSlice, 2);
+
+               if (fDebugLevel >= 4)
+               {
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
+                       fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
+               }
+
+               if (fDebugLevel >= 5) HLTInfo("Running GPU Neighbours Cleaner");
+               AliHLTTPCCAProcess<AliHLTTPCCANeighboursCleaner> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-2, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
+               if (CUDASync("Neighbours Cleaner")) return 1;
+
+               StandalonePerfTime(firstSlice + iSlice, 3);
+
+               if (fDebugLevel >= 4)
+               {
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().Memory(), fGpuTracker[iSlice].Data().Memory(), fSlaveTrackers[firstSlice + iSlice].Data().GpuMemorySize(), cudaMemcpyDeviceToHost));
+                       fSlaveTrackers[firstSlice + iSlice].DumpLinks(*fOutFile);
+               }
+
+               if (fDebugLevel >= 5) HLTInfo("Running GPU Start Hits Finder");
+               AliHLTTPCCAProcess<AliHLTTPCCAStartHitsFinder> <<<fSlaveTrackers[firstSlice + iSlice].Param().NRows()-6, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
+               if (CUDASync("Start Hits Finder")) return 1;
+
+               StandalonePerfTime(firstSlice + iSlice, 4);
+
+               if (fDebugLevel >= 5) HLTInfo("Running GPU Start Hits Sorter");
+               AliHLTTPCCAProcess<AliHLTTPCCAStartHitsSorter> <<<30, 256, 0, cudaStreams[iSlice & 1]>>>(iSlice);
+               if (CUDASync("Start Hits Sorter")) return 1;
+
+               StandalonePerfTime(firstSlice + iSlice, 5);
+
+               if (fDebugLevel >= 2)
+               {
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
+                       if (fDebugLevel >= 5) HLTInfo("Obtaining Number of Start Hits from GPU: %d", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
+                       if (*fSlaveTrackers[firstSlice + iSlice].NTracklets() > HLTCA_GPU_MAX_TRACKLETS)
+                       {
+                               HLTError("HLTCA_GPU_MAX_TRACKLETS constant insuffisant");
+                               return(1);
+                       }
+               }
+
+               if (fDebugLevel >= 4)
+               {
+                       *fOutFile << "Temporary ";
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletStartHits(), fGpuTracker[iSlice].TrackletTmpStartHits(), pClusterData[iSlice].NumberOfClusters() * sizeof(AliHLTTPCCAHitId), cudaMemcpyDeviceToHost));
+                       fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
+                       uint3* tmpMemory = (uint3*) malloc(sizeof(uint3) * fSlaveTrackers[firstSlice + iSlice].Param().NRows());
+                       CudaFailedMsg(cudaMemcpy(tmpMemory, fGpuTracker[iSlice].RowStartHitCountOffset(), fSlaveTrackers[firstSlice + iSlice].Param().NRows() * sizeof(uint3), cudaMemcpyDeviceToHost));
+                       *fOutFile << "Start Hits Sort Vector:" << std::endl;
+                       for (int i = 0;i < fSlaveTrackers[firstSlice + iSlice].Param().NRows();i++)
+                       {
+                               *fOutFile << "Row: " << i << ", Len: " << tmpMemory[i].x << ", Offset: " << tmpMemory[i].y << ", New Offset: " << tmpMemory[i].z << std::endl;
+                       }
+                       free(tmpMemory);
+
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fSlaveTrackers[firstSlice + iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
+                       fSlaveTrackers[firstSlice + iSlice].DumpStartHits(*fOutFile);
+               }
+
+               StandalonePerfTime(firstSlice + iSlice, 6);
+               
+               fSlaveTrackers[firstSlice + iSlice].SetGPUTrackerTracksMemory((char*) TracksMemory(fHostLockedMemory, iSlice), HLTCA_GPU_MAX_TRACKS, pClusterData[iSlice].NumberOfClusters());
+       }
+
+       StandalonePerfTime(firstSlice, 7);
+#ifdef HLTCA_GPU_PREFETCHDATA
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
        {
-               printf("Out of CUDA Memory\n");
-               return(1);
+               if (fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v) > ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4))
+               {
+                       HLTError("Insufficiant GPU shared Memory, required: %d, available %d", fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v), ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4 * sizeof(uint4));
+                       return(1);
+               }
+               if (fDebugLevel >= 1)
+               {
+                       static int infoShown = 0;
+                       if (!infoShown)
+                       {
+                               HLTInfo("GPU Shared Memory Cache Size: %d", 2 * fSlaveTrackers[firstSlice + iSlice].Data().GPUSharedDataReq() * sizeof(ushort_v));
+                               infoShown = 1;
+                       }
+               }
        }
+#endif
 
-       tracker->fData.ClearHitWeights();
-       CUDA_FAILED_MSG(cudaMemcpy(gpuTracker.fData.fHitWeights, tracker->fData.fHitWeights, tracker->fData.fNumberOfHits * sizeof(int), cudaMemcpyHostToDevice));
-       CUDA_FAILED_MSG(cudaMemcpyToSymbol(gAliHLTTPCCATracker, &gpuTracker, sizeof(AliHLTTPCCATracker)));
+       if (fDebugLevel >= 5) HLTInfo("Initialising Tracklet Constructor Scheduler");
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+       {
+               AliHLTTPCCATrackletConstructorInit<<<HLTCA_GPU_MAX_TRACKLETS /* *fSlaveTrackers[firstSlice + iSlice].NTracklets() */ / HLTCA_GPU_THREAD_COUNT + 1, HLTCA_GPU_THREAD_COUNT>>>(iSlice);
+               if (CUDASync("Tracklet Initializer")) return 1;
+               DumpRowBlocks(fSlaveTrackers, iSlice);
+       }
 
-       if (DebugLevel >= 4) printf("Initialising Slice Tracker (CPU) Track Memory\n");
-       tracker->fTrackMemory = reinterpret_cast<char*> ( new uint4 [ gpuTracker.fTrackMemorySize/sizeof( uint4 ) + 100] );
-    tracker->SetPointersTracks( *tracker->NTracklets(), tracker->NHitsTotal() );
+       if (fDebugLevel >= 5) HLTInfo("Running GPU Tracklet Constructor");
+       AliHLTTPCCATrackletConstructorNewGPU<<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT>>>();
+       if (CUDASync("Tracklet Constructor (new)")) return 1;
+       
+       StandalonePerfTime(firstSlice, 8);
 
-/*     tracker->RunTrackletConstructor();
-       if (DebugLevel >= 3)
+       if (fDebugLevel >= 4)
        {
-               *OutFile << "Tracklet Hits:" << endl;
-               tracker->DumpTrackletHits(*OutFile);
-       }*/
-
-       int nMemThreads = TRACKLET_CONSTRUCTOR_NMEMTHREDS;
-    nThreads = 256;//96;
-    nBlocks = *tracker->NTracklets()/nThreads + 1;
-    if( nBlocks<30 ){
-               nBlocks = 30;
-               nThreads = (*tracker->NTracklets())/nBlocks+1;
-               if( nThreads%32 ) nThreads = (nThreads/32+1)*32;
+               for (int iSlice = 0;iSlice < sliceCountLocal;iSlice++)
+               {
+                       DumpRowBlocks(&fSlaveTrackers[firstSlice], iSlice, false);
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemory(), fGpuTracker[iSlice].CommonMemorySize(), cudaMemcpyDeviceToHost));
+                       if (fDebugLevel >= 5)
+                       {
+                               HLTInfo("Obtained %d tracklets", *fSlaveTrackers[firstSlice + iSlice].NTracklets());
+                       }
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemory(), fGpuTracker[iSlice].TrackletMemorySize(), cudaMemcpyDeviceToHost));
+                       CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].HitMemory(), fGpuTracker[iSlice].HitMemory(), fGpuTracker[iSlice].HitMemorySize(), cudaMemcpyDeviceToHost));
+                       fSlaveTrackers[firstSlice + iSlice].DumpTrackletHits(*fOutFile);
+               }
        }
-       if (DebugLevel >= 4) printf("Running GPU Tracklet Constructor\n");
 
-       //AliHLTTPCCAProcess1<AliHLTTPCCATrackletConstructor> <<<nBlocks, nMemThreads+nThreads>>>(); 
-       AliHLTTPCCAProcess1<AliHLTTPCCATrackletConstructor> <<<1, TRACKLET_CONSTRUCTOR_NMEMTHREDS + *tracker->fNTracklets>>>();
-       if (CUDASync()) return 1;
+       for (int iSlice = 0;iSlice < sliceCountLocal;iSlice += HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT)
+       {
+               if (fDebugLevel >= 5) HLTInfo("Running HLT Tracklet selector (Slice %d to %d)", iSlice, iSlice + HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT);
+               AliHLTTPCCAProcessMulti<AliHLTTPCCATrackletSelector><<<HLTCA_GPU_BLOCK_COUNT, HLTCA_GPU_THREAD_COUNT, 0, cudaStreams[iSlice]>>>(iSlice, CAMath::Min(HLTCA_GPU_TRACKLET_SELECTOR_SLICE_COUNT, sliceCountLocal - iSlice));
+       }
+       if (CUDASync("Tracklet Selector")) return 1;
+       StandalonePerfTime(firstSlice, 9);
 
-       if (DebugLevel >= 3)
+       CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + 0].CommonMemory(), fGpuTracker[0].CommonMemory(), fGpuTracker[0].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[0]));
+       for (int iSliceTmp = 0;iSliceTmp <= sliceCountLocal;iSliceTmp++)
        {
-               *OutFile << "Tracklet Hits:" << endl;
-               CUDA_FAILED_MSG(cudaMemcpy(tracker->fNTracklets, gpuTracker.fNTracklets, sizeof(int), cudaMemcpyDeviceToHost));
-               CUDA_FAILED_MSG(cudaMemcpy(tracker->fTracklets, gpuTracker.fTracklets, gpuTracker.fTrackMemorySize, cudaMemcpyDeviceToHost));
-               tracker->DumpTrackletHits(*OutFile);
-    }
+               if (iSliceTmp < sliceCountLocal)
+               {
+                       int iSlice = iSliceTmp;
+                       if (fDebugLevel >= 5) HLTInfo("Transfering Tracks from GPU to Host");
+                       cudaStreamSynchronize(cudaStreams[iSlice]);
+                       CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].Tracks(), fGpuTracker[iSlice].Tracks(), sizeof(AliHLTTPCCATrack) * *fSlaveTrackers[firstSlice + iSlice].NTracks(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
+                       CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice].TrackHits(), fGpuTracker[iSlice].TrackHits(), sizeof(AliHLTTPCCAHitId) * *fSlaveTrackers[firstSlice + iSlice].NTrackHits(), cudaMemcpyDeviceToHost, cudaStreams[iSlice]));
+                       if (iSlice + 1 < sliceCountLocal)
+                               CudaFailedMsg(cudaMemcpyAsync(fSlaveTrackers[firstSlice + iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemory(), fGpuTracker[iSlice + 1].CommonMemorySize(), cudaMemcpyDeviceToHost, cudaStreams[iSlice + 1]));
+               }
+
+               if (iSliceTmp)
+               {
+                       int iSlice = iSliceTmp - 1;
+                       cudaStreamSynchronize(cudaStreams[iSlice]);
+
+                       if (fDebugLevel >= 4)
+                       {
+                               CudaFailedMsg(cudaMemcpy(fSlaveTrackers[firstSlice + iSlice].Data().HitWeights(), fGpuTracker[iSlice].Data().HitWeights(), fSlaveTrackers[firstSlice + iSlice].Data().NumberOfHitsPlusAlign() * sizeof(int), cudaMemcpyDeviceToHost));
+                               fSlaveTrackers[firstSlice + iSlice].DumpHitWeights(*fOutFile);
+                               fSlaveTrackers[firstSlice + iSlice].DumpTrackHits(*fOutFile);
+                       }
+
+                       if (fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError)
+                       {
+                               HLTError("GPU Tracker returned Error Code %d", fSlaveTrackers[firstSlice + iSlice].GPUParameters()->fGPUError);
+                               return(1);
+                       }
+                       if (fDebugLevel >= 5) HLTInfo("Tracks Transfered: %d / %d", *fSlaveTrackers[firstSlice + iSlice].NTracks(), *fSlaveTrackers[firstSlice + iSlice].NTrackHits());
+
+                       fSlaveTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
+                       fSlaveTrackers[firstSlice + iSlice].WriteOutput();\r
+
+                       if (fDebugLevel >= 4)
+                       {
+                               delete[] fSlaveTrackers[firstSlice + iSlice].HitMemory();
+                               delete[] fSlaveTrackers[firstSlice + iSlice].TrackletMemory();
+                       }
+               }
+       }
+
+       StandalonePerfTime(firstSlice, 10);
+
+       if (fDebugLevel >= 5) HLTInfo("GPU Reconstruction finished");
 
-       //tracker->RunTrackletSelector();
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+       char* stageAtSync = (char*) malloc(100000000);
+       CudaFailedMsg(cudaMemcpy(stageAtSync, fGpuTracker[0].fStageAtSync, 100 * 1000 * 1000, cudaMemcpyDeviceToHost));
+       cudaFree(fGpuTracker[0].fStageAtSync);
+
+       FILE* fp = fopen("profile.txt", "w+");
+       FILE* fp2 = fopen("profile.bmp", "w+b");
+       int nEmptySync = 0, fEmpty;
+
+       const int bmpheight = 1000;
+       BITMAPFILEHEADER bmpFH;
+       BITMAPINFOHEADER bmpIH;
+       ZeroMemory(&bmpFH, sizeof(bmpFH));
+       ZeroMemory(&bmpIH, sizeof(bmpIH));
        
+       bmpFH.bfType = 19778; //"BM"
+       bmpFH.bfSize = sizeof(bmpFH) + sizeof(bmpIH) + (HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1) * bmpheight ;
+       bmpFH.bfOffBits = sizeof(bmpFH) + sizeof(bmpIH);
 
-       nThreads = 128;
-       nBlocks = *tracker->NTracklets()/nThreads + 1;
-       if( nBlocks<30 ){
-         nBlocks = 30;  
-         nThreads = *tracker->NTracklets()/nBlocks+1;
-         nThreads = (nThreads/32+1)*32;
-       }
-       if (DebugLevel >= 4) printf("Running GPU Tracklet Selector\n");
-       AliHLTTPCCAProcess<AliHLTTPCCATrackletSelector><<<nBlocks, nThreads>>>();
-       //AliHLTTPCCAProcess<AliHLTTPCCATrackletSelector><<<1, *tracker->fNTracklets>>>();
-       if (CUDASync()) return 1;
-
-       if (DebugLevel >= 4) printf("Transfering Tracks from GPU to Host ");
-       CUDA_FAILED_MSG(cudaMemcpy(tracker->NTracks(), gpuTracker.NTracks(), sizeof(int), cudaMemcpyDeviceToHost));
-       CUDA_FAILED_MSG(cudaMemcpy(tracker->NTrackHits(), gpuTracker.NTrackHits(), sizeof(int), cudaMemcpyDeviceToHost));
-       if (DebugLevel >= 4) printf("%d / %d\n", *tracker->fNTracks, *tracker->fNTrackHits);
-       size = sizeof(AliHLTTPCCATrack) * *tracker->NTracks();
-       CUDA_FAILED_MSG(cudaMemcpy(tracker->Tracks(), gpuTracker.Tracks(), size, cudaMemcpyDeviceToHost));
-       size = sizeof(AliHLTTPCCAHitId) * *tracker->NTrackHits();
-       if (CUDA_FAILED_MSG(cudaMemcpy(tracker->TrackHits(), gpuTracker.TrackHits(), size, cudaMemcpyDeviceToHost)))
+       bmpIH.biSize = sizeof(bmpIH);
+       bmpIH.biWidth = HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT / 32 * 33 - 1;
+       bmpIH.biHeight = bmpheight;
+       bmpIH.biPlanes = 1;
+       bmpIH.biBitCount = 32;
+
+       fwrite(&bmpFH, 1, sizeof(bmpFH), fp2);
+       fwrite(&bmpIH, 1, sizeof(bmpIH), fp2);  
+
+       for (int i = 0;i < bmpheight * HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;i += HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT)
        {
-               printf("CUDA Error during Reconstruction\n");
-               return(1);
+               fEmpty = 1;
+               for (int j = 0;j < HLTCA_GPU_BLOCK_COUNT * HLTCA_GPU_THREAD_COUNT;j++)
+               {
+                       fprintf(fp, "%d\t", stageAtSync[i + j]);
+                       int color = 0;
+                       if (stageAtSync[i + j] == 1) color = RGB(255, 0, 0);
+                       if (stageAtSync[i + j] == 2) color = RGB(0, 255, 0);
+                       if (stageAtSync[i + j] == 3) color = RGB(0, 0, 255);
+                       if (stageAtSync[i + j] == 4) color = RGB(255, 255, 0);
+                       fwrite(&color, 1, sizeof(int), fp2);
+                       if (j > 0 && j % 32 == 0)
+                       {
+                               color = RGB(255, 255, 255);
+                               fwrite(&color, 1, 4, fp2);
+                       }
+                       if (stageAtSync[i + j]) fEmpty = 0;
+               }
+               fprintf(fp, "\n");
+               if (fEmpty) nEmptySync++;
+               else nEmptySync = 0;
+               //if (nEmptySync == HLTCA_GPU_SCHED_ROW_STEP + 2) break;
        }
 
-       if (DebugLevel >= 3)
-       {
-               *OutFile << "Track Hits: (" << *tracker->NTracks() << ")" << endl;
-               tracker->DumpTrackHits(*OutFile);
-    }
+       fclose(fp);
+       fclose(fp2);
+       free(stageAtSync);
+#endif 
 
-       if (DebugLevel >= 4) printf("Running WriteOutput\n");
-       tracker->WriteOutput();
+       return(0);
+}
 
-       if (DebugLevel >= 4) printf("GPU Reconstruction finished\n");
-       
+int AliHLTTPCCAGPUTracker::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
+{
+       //Initialize Slice Tracker Parameter for a slave tracker
+       fSlaveTrackers[iSlice].Initialize(param);
+       if (fSlaveTrackers[iSlice].Param().NRows() != HLTCA_ROW_COUNT)
+       {
+               HLTError("Error, Slice Tracker %d Row Count of %d exceeds Constant of %d", iSlice, fSlaveTrackers[iSlice].Param().NRows(), HLTCA_ROW_COUNT);
+               return(1);
+       }
        return(0);
 }
 
 int AliHLTTPCCAGPUTracker::ExitGPU()
 {
-       cudaFree(GPUMemory);
+       //Uninitialize CUDA
+       cudaThreadSynchronize();
+       if (fGPUMemory)
+       {
+               cudaFree(fGPUMemory);
+               fGPUMemory = NULL;
+       }
+       if (fHostLockedMemory)
+       {
+               for (int i = 0;i < CAMath::Max(3, fSliceCount);i++)
+               {
+                       cudaStreamDestroy(((cudaStream_t*) fpCudaStreams)[i]);
+               }
+               free(fpCudaStreams);
+               fGpuTracker = NULL;
+               cudaFreeHost(fHostLockedMemory);
+       }
+
+       if (CudaFailedMsg(cudaThreadExit()))
+       {
+               HLTError("Could not uninitialize GPU");
+               return(1);
+       }
+       HLTInfo("CUDA Uninitialized");
        return(0);
 }
diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cu.patch b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cu.patch
new file mode 100644 (file)
index 0000000..de94951
--- /dev/null
@@ -0,0 +1,11 @@
+--- AliHLTTPCCAGPUTracker.cucpp        2009-05-28 12:14:09.000000000 +0200
++++ release/x86_64-pc-linux-gnu/code/AliHLTTPCCAGPUTracker.cucpp       2009-05-28 12:10:25.000000000 +0200
+@@ -23186,7 +23186,7 @@
+ static T2 *Alloc(int s) { auto T2 *p = (reinterpret_cast< T2 *>(_mm_malloc(s * sizeof(CacheLineSizeHelper< T> ), 128))); return new (p) T2 [s]; }
+ static void Free(T2 *const p, int size) {
+ for (int i = 0; i < size; ++i) {
+-((p[i]).~CacheLineSizeHelper());
++((p[i]).~T2());
+ }
+ _mm_free(p);
+ }
diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCAGPUTracker.cxx
new file mode 100644 (file)
index 0000000..5c8f67a
--- /dev/null
@@ -0,0 +1,37 @@
+// **************************************************************************
+// This file is property of and copyright by the ALICE HLT Project          *
+// ALICE Experiment at CERN, All rights reserved.                           *
+//                                                                          *
+// Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
+//                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *
+//                                     David Rohr <drohr@kip.uni-heidelberg.de>                                *
+//                  for The ALICE HLT Project.                              *
+//                                                                          *
+// Permission to use, copy, modify and distribute this software and its     *
+// documentation strictly for non-commercial purposes is hereby granted     *
+// without fee, provided that the above copyright notice appears in all     *
+// copies and that both the copyright notice and this permission notice     *
+// appear in the supporting documentation. The authors make no claims       *
+// about the suitability of this software for any purpose. It is            *
+// provided "as is" without express or implied warranty.                    *
+//                                                                          *
+//***************************************************************************
+
+//If not building GPU Code then build dummy functions to link against
+#include "AliHLTTPCCAGPUTracker.h"
+
+int AliHLTTPCCAGPUTracker::InitGPU(int /*sliceCount*/, int /*forceDeviceID*/)
+{
+    //Dummy init function if CUDA is not available
+    HLTInfo("CUDA Compiler was not available during build process, omitting CUDA initialization");
+    return(1);
+}
+void StandalonePerfTime(int /*iSlice*/, int /*i*/) {}
+//template <class T> inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int alignment) {return(NULL);}
+//bool AliHLTTPCCAGPUTracker::CudaFailedMsg(cudaError_t error) {return(true);}
+//int AliHLTTPCCAGPUTracker::CUDASync() {return(1);}
+void AliHLTTPCCAGPUTracker::SetDebugLevel(int /*dwLevel*/, std::ostream* /*NewOutFile*/) {}
+int AliHLTTPCCAGPUTracker::SetGPUTrackerOption(char* /*OptionName*/, int /*OptionValue*/) {return(1);}
+int Reconstruct(AliHLTTPCCASliceOutput* /*pTracker*/, AliHLTTPCCAClusterData* /*pClusterData*/, int /*fFirstSlice*/, int /*fSliceCount*/) {return(1);}
+int AliHLTTPCCAGPUTracker::ExitGPU() {return(0);}
+int AliHLTTPCCAGPUTracker::InitializeSliceParam(int /*iSlice*/, AliHLTTPCCAParam& /*param*/) {}
index b44819f..fd432e5 100644 (file)
@@ -5,36 +5,79 @@
 //                                                                        *
 //*************************************************************************
 
+#ifndef ALIHLTTPCCAGPUTRACKER_H
+#define ALIHLTTPCCAGPUTRACKER_H
+
 #include "AliHLTTPCCADef.h"
 #include "AliHLTTPCCATracker.h"
+#include "AliHLTLogging.h"
+
+class AliHLTTPCCARow;
 
-class AliHLTTPCCAGPUTracker
+class AliHLTTPCCAGPUTracker : AliHLTLogging
 {
 public:
-       AliHLTTPCCAGPUTracker();
-       ~AliHLTTPCCAGPUTracker();
+       AliHLTTPCCAGPUTracker() :
+         fGpuTracker(NULL),
+         fGPUMemory(NULL),
+         fHostLockedMemory(NULL),
+         fDebugLevel(0),
+         fOutFile(NULL),
+         fGPUMemSize(0),
+         fpCudaStreams(NULL),
+         fSliceCount(0)
+         {};
+         ~AliHLTTPCCAGPUTracker() {};
 
-       int InitGPU();
-       int Reconstruct(AliHLTTPCCATracker* tracker);
+       int InitGPU(int sliceCount = 1, int forceDeviceID = -1);
+       int Reconstruct(AliHLTTPCCASliceOutput* pOutput, AliHLTTPCCAClusterData* pClusterData, int fFirstSlice, int fSliceCount = -1);
        int ExitGPU();
 
-       void SetDebugLevel(int dwLevel, std::ostream *NewOutFile = NULL);
+       void SetDebugLevel(const int dwLevel, std::ostream* const NewOutFile = NULL);
+       int SetGPUTrackerOption(char* OptionName, int OptionValue);
+
+       unsigned long long int* PerfTimer(int iSlice, unsigned int i) {return(fSlaveTrackers ? fSlaveTrackers[iSlice].PerfTimer(i) : NULL); }
+
+       int InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param);
 
 private:
-       AliHLTTPCCATracker gpuTracker;
-       void* GPUMemory;
+       static void* RowMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + iSlice * sizeof(AliHLTTPCCARow) * (HLTCA_ROW_COUNT + 1) ); }
+       static void* CommonMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + iSlice * AliHLTTPCCATracker::CommonMemorySize() ); }
+       static void* SliceDataMemory(void* const BaseMemory, int iSlice) { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + iSlice * HLTCA_GPU_SLICE_DATA_MEMORY ); }
+       void* GlobalMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY) + iSlice * HLTCA_GPU_GLOBAL_MEMORY ); }
+       void* TracksMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY) + iSlice * HLTCA_GPU_TRACKS_MEMORY ); }
+       void* TrackerMemory(void* const BaseMemory, int iSlice) const { return( ((char*) BaseMemory) + HLTCA_GPU_ROWS_MEMORY + HLTCA_GPU_COMMON_MEMORY + fSliceCount * (HLTCA_GPU_SLICE_DATA_MEMORY + HLTCA_GPU_TRACKS_MEMORY) + iSlice * sizeof(AliHLTTPCCATracker) ); }
+
+       void DumpRowBlocks(AliHLTTPCCATracker* tracker, int iSlice, bool check = true);
+
+       AliHLTTPCCATracker *fGpuTracker;
+       void* fGPUMemory;
+       void* fHostLockedMemory;
 
-       int CUDASync();
+       int CUDASync(char* state = "UNKNOWN");
        template <class T> T* alignPointer(T* ptr, int alignment);
 
-       int DebugLevel;
-       std::ostream *OutFile;
-       int GPUMemSize;
+       void StandalonePerfTime(int iSlice, int i);
+
+       int fDebugLevel;                        //Debug Level for GPU Tracker
+       std::ostream* fOutFile;         //Debug Output Stream Pointer
+       unsigned long long int fGPUMemSize;     //Memory Size to allocate on GPU
+
+       void* fpCudaStreams;
+
+       int fSliceCount;
+
+       static const int fgkNSlices = 36;
+       AliHLTTPCCATracker fSlaveTrackers[fgkNSlices];
 #ifdef HLTCA_GPUCODE
-       bool CUDA_FAILED_MSG(cudaError_t error);
+       bool CudaFailedMsg(cudaError_t error);
 #endif
+
        // disable copy
        AliHLTTPCCAGPUTracker( const AliHLTTPCCAGPUTracker& );
        AliHLTTPCCAGPUTracker &operator=( const AliHLTTPCCAGPUTracker& );
 
+       ClassDef( AliHLTTPCCAGPUTracker, 0 )
 };
+
+#endif
\ No newline at end of file
index eacbaab..501f10a 100644 (file)
@@ -30,6 +30,7 @@
 
 GPUd() void AliHLTTPCCAGrid::CreateEmpty()
 {
+  //Create an empty grid
   fYMin = 0.f;
   fYMax = 1.f;
   fZMin = 0.f;
@@ -87,7 +88,7 @@ int AliHLTTPCCAGrid::GetBinBounded( float Y, float Z ) const
   return bin;
 }
 
-GPUd() void AliHLTTPCCAGrid::GetBin( float Y, float Z, int *bY, int *bZ ) const
+GPUd() void AliHLTTPCCAGrid::GetBin( float Y, float Z, int* const bY, int* const bZ ) const
 {
   //* get the bin pointer
 
index 7a8af5d..e08bf7d 100644 (file)
@@ -31,7 +31,7 @@ class AliHLTTPCCAGrid
      * returns -1 if the row is empty == no hits
      */
     GPUd() int GetBinBounded( float Y, float Z ) const;
-    GPUd() void GetBin( float Y, float Z, int *bY, int *bZ ) const;
+    GPUd() void GetBin( float Y, float Z, int* const bY, int* const bZ ) const;
 
     GPUd() unsigned int  N()        const { return fN;  }
     GPUd() unsigned int  Ny()       const { return fNy; }
@@ -43,10 +43,7 @@ class AliHLTTPCCAGrid
     GPUd() float StepYInv() const { return fStepYInv; }
     GPUd() float StepZInv() const { return fStepZInv; }
 
-#ifndef CUDA_DEVICE_EMULATION
   private:
-#endif
-
     unsigned int fNy;        //* N bins in Y
     unsigned int fNz;        //* N bins in Z
     unsigned int fN;         //* total N bins
index f2279c6..f010439 100644 (file)
@@ -29,10 +29,7 @@ class AliHLTTPCCAHit
     GPUhd() void SetY( float v ) { fY = v;    }
     GPUhd() void SetZ( float v ) { fZ = v;    }
 
-#ifndef CUDA_DEVICE_EMULATION
   protected:
-#endif
-
     float fY, fZ;       // Y and Z position of the TPC cluster
 
 };
index a37d358..549e232 100644 (file)
@@ -20,7 +20,7 @@
 #include "AliHLTTPCCATracker.h"
 #include "AliHLTTPCCAGrid.h"
 #include "AliHLTTPCCAHit.h"
-#include "AliHLTTPCCARow.h"
+class AliHLTTPCCARow;
 
 
 GPUd() void AliHLTTPCCAHitArea::Init( const AliHLTTPCCARow &row, const AliHLTTPCCASliceData &slice, float y, float z,
@@ -45,8 +45,13 @@ GPUd() void AliHLTTPCCAHitArea::Init( const AliHLTTPCCARow &row, const AliHLTTPC
   fIz = bZmin;
 
   // for given fIz (which is min atm.) get
+#ifdef HLTCA_GPU_TEXTURE_FETCHa
+  fHitYfst = tex1Dfetch(gAliTexRefu, ((char*) slice.FirstHitInBin(row) - slice.GPUTextureBaseConst()) / sizeof(unsigned short) + fIndYmin);
+  fHitYlst = tex1Dfetch(gAliTexRefu, ((char*) slice.FirstHitInBin(row) - slice.GPUTextureBaseConst()) / sizeof(unsigned short) + fIndYmin + fBDY);
+#else
   fHitYfst = slice.FirstHitInBin( row, fIndYmin ); // first and
   fHitYlst = slice.FirstHitInBin( row, fIndYmin + fBDY ); // last hit index in the bin
+#endif
   fIh = fHitYfst;
 }
 
@@ -72,13 +77,24 @@ GPUd() int AliHLTTPCCAHitArea::GetNext( const AliHLTTPCCATracker &tracker, const
       // go to next z and start y from the min again
       ++fIz;
       fIndYmin += fNy;
+#ifdef HLTCA_GPU_TEXTURE_FETCHa
+         fHitYfst = tex1Dfetch(gAliTexRefu, ((char*) slice.FirstHitInBin(row) - slice.GPUTextureBaseConst()) / sizeof(unsigned short) + fIndYmin);
+         fHitYlst = tex1Dfetch(gAliTexRefu, ((char*) slice.FirstHitInBin(row) - slice.GPUTextureBaseConst()) / sizeof(unsigned short) + fIndYmin + fBDY);
+#else
       fHitYfst = slice.FirstHitInBin( row, fIndYmin );
       fHitYlst = slice.FirstHitInBin( row, fIndYmin + fBDY );
+#endif
       fIh = fHitYfst;
     }
 
-    h->SetY( y0 + tracker.HitDataY( row, fIh ) * stepY );
+#ifdef HLTCA_GPU_TEXTURE_FETCHa
+       ushort2 tmpval = tex1Dfetch(gAliTexRefu2, ((char*) slice.HitData(row) - slice.GPUTextureBaseConst()) / sizeof(ushort2) + fIh);;
+       h->SetY( y0 + tmpval.x * stepY );
+    h->SetZ( z0 + tmpval.y * stepZ );
+#else
+       h->SetY( y0 + tracker.HitDataY( row, fIh ) * stepY );
     h->SetZ( z0 + tracker.HitDataZ( row, fIh ) * stepZ );
+#endif
 
     if ( 1 && ( h->Z() > fMaxZ || h->Z() < fMinZ || h->Y() < fMinY || h->Y() > fMaxY ) ) { //SG!!!
       fIh++;
index f1b8214..4084a2f 100644 (file)
@@ -59,10 +59,7 @@ class AliHLTTPCCAHitArea
     int  Ny() const { return fNy; }
     int  HitOffset() const { return fHitOffset; }
 
-#ifndef CUDA_DEVICE_EMULATION
   protected:
-#endif
-
     float fY;      // search coordinates
     float fZ;      // search coordinates
     float fMinZ;   // search coordinates
index 8501283..1d0cc2e 100644 (file)
@@ -24,9 +24,7 @@ class AliHLTTPCCAHitId
     GPUhd() int RowIndex() const { return fId & 0xff; }
     GPUhd() int HitIndex() const { return fId >> 8; }
 
-#ifndef CUDA_DEVICE_EMULATION
   private:
-#endif
     int fId;
 };
 
index 40907d7..9fa589d 100644 (file)
@@ -847,9 +847,9 @@ void AliHLTTPCCAMerger::Merging()
   }
 
   fOutput->SetNTracks( nOutTracks );
-#ifdef HLTCA_STANDALONE
+  #ifdef HLTCA_STANDALONE
   printf("Tracks Output: %d\n", nOutTracks);
-#endif
+  #endif
   fOutput->SetNTrackClusters( nOutTrackClusters );
   fOutput->SetPointers();
 
index 4987d8e..ad5c1bc 100644 (file)
@@ -33,10 +33,7 @@ class AliHLTTPCCANeighboursCleaner
         AliHLTTPCCASharedMemory& operator=( const AliHLTTPCCASharedMemory& /*dummy*/ ) { return *this; }
 #endif
 
-#ifndef CUDA_DEVICE_EMULATION
       protected:
-#endif
-
         int fIRow; // current row index
         int fIRowUp; // current row index
         int fIRowDn; // current row index
index 93fb8ee..715754f 100644 (file)
@@ -35,11 +35,27 @@ GPUd() void AliHLTTPCCANeighboursFinder::Thread
   //* find neighbours
 
   if ( iSync == 0 ) {
+#ifdef HLTCA_GPUCODE
+       for (int i = iThread;i < sizeof(AliHLTTPCCARow) / sizeof(int);i += nThreads)
+       {
+               reinterpret_cast<int*>(&s.fRow)[i] = reinterpret_cast<int*>(&tracker.SliceDataRows()[iBlock])[i];
+               if (iBlock >= 2 && iBlock <= tracker.Param().NRows() - 3)
+               {
+                       reinterpret_cast<int*>(&s.fRowUp)[i] = reinterpret_cast<int*>(&tracker.SliceDataRows()[iBlock + 2])[i];
+                       reinterpret_cast<int*>(&s.fRowDown)[i] = reinterpret_cast<int*>(&tracker.SliceDataRows()[iBlock - 2])[i];
+               }
+       }
+       __syncthreads();
+#endif
     if ( iThread == 0 ) {
       s.fNRows = tracker.Param().NRows();
       s.fIRow = iBlock;
       if ( s.fIRow < s.fNRows ) {
-        const AliHLTTPCCARow &row = tracker.Row( s.fIRow );
+#ifdef HLTCA_GPUCODE
+               const AliHLTTPCCARow &row = s.fRow;
+#else
+               const AliHLTTPCCARow &row = tracker.Row( s.fIRow );
+#endif
         s.fNHits = row.NHits();
 
         if ( ( s.fIRow >= 2 ) && ( s.fIRow <= s.fNRows - 3 ) ) {
@@ -47,9 +63,14 @@ GPUd() void AliHLTTPCCANeighboursFinder::Thread
           s.fIRowDn = s.fIRow - 2;
 
           // references to the rows above and below
+
+#ifdef HLTCA_GPUCODE
+          const AliHLTTPCCARow &rowUp = s.fRowUp;
+          const AliHLTTPCCARow &rowDn = s.fRowDown;
+#else
           const AliHLTTPCCARow &rowUp = tracker.Row( s.fIRowUp );
           const AliHLTTPCCARow &rowDn = tracker.Row( s.fIRowDn );
-
+#endif
           // the axis perpendicular to the rows
           const float xDn = rowDn.X();
           const float x   = row.X();
@@ -67,29 +88,38 @@ GPUd() void AliHLTTPCCANeighboursFinder::Thread
           // UpTx/DnTx is used to move the HitArea such that central events are preferred (i.e. vertices
           // coming from y = 0, z = 0).
 
-          s.fGridUp = tracker.Row( s.fIRowUp ).Grid();
-          s.fGridDn = tracker.Row( s.fIRowDn ).Grid();
+          //s.fGridUp = tracker.Row( s.fIRowUp ).Grid();
+          //s.fGridDn = tracker.Row( s.fIRowDn ).Grid();
         }
       }
     }
   } else if ( iSync == 1 ) {
     if ( s.fIRow < s.fNRows ) {
-      if ( ( s.fIRow == 0 ) || ( s.fIRow == s.fNRows - 1 ) || ( s.fIRow == 1 ) || ( s.fIRow == s.fNRows - 2 ) ) {
-        const AliHLTTPCCARow &row = tracker.Row( s.fIRow );
+      if ( ( s.fIRow <= 1 ) || ( s.fIRow >= s.fNRows - 2 ) ) {
+#ifdef HLTCA_GPUCODE
+               const AliHLTTPCCARow &row = s.fRow;
+#else
+               const AliHLTTPCCARow &row = tracker.Row( s.fIRow );
+#endif
         for ( int ih = iThread; ih < s.fNHits; ih += nThreads ) {
           tracker.SetHitLinkUpData( row, ih, -1 );
           tracker.SetHitLinkDownData( row, ih, -1 );
         }
       } else {
-        const AliHLTTPCCARow &rowUp = tracker.Row( s.fIRowUp );
-        const AliHLTTPCCARow &rowDn = tracker.Row( s.fIRowDn );
+/*#ifdef HLTCA_GPUCODE
+          const AliHLTTPCCARow &rowUp = s.fRowUp;
+          const AliHLTTPCCARow &rowDn = s.fRowDown;
+#else
+          const AliHLTTPCCARow &rowUp = tracker.Row( s.fIRowUp );
+          const AliHLTTPCCARow &rowDn = tracker.Row( s.fIRowDn );
+#endif
 
         for ( unsigned int ih = iThread; ih < s.fGridUp.N() + s.fGridUp.Ny() + 2; ih += nThreads ) {
           s.fGridContentUp[ih] = tracker.FirstHitInBin( rowUp, ih );
         }
         for ( unsigned int ih = iThread; ih < s.fGridDn.N() + s.fGridDn.Ny() + 2; ih += nThreads ) {
           s.fGridContentDn[ih] = tracker.FirstHitInBin( rowDn, ih );
-        }
+        }*/
       }
     }
   } else if ( iSync == 2 ) {
@@ -98,20 +128,30 @@ GPUd() void AliHLTTPCCANeighboursFinder::Thread
     float chi2Cut = 3.*3.*4 * ( s.fUpDx * s.fUpDx + s.fDnDx * s.fDnDx );
     const float kAreaSize = tracker.Param().NeighboursSearchArea();
     //float chi2Cut = 3.*3.*(s.fUpDx*s.fUpDx + s.fDnDx*s.fDnDx ); //SG
-    const int kMaxN = 20;
-
-    const AliHLTTPCCARow &row = tracker.Row( s.fIRow );
-    const AliHLTTPCCARow &rowUp = tracker.Row( s.fIRowUp );
-    const AliHLTTPCCARow &rowDn = tracker.Row( s.fIRowDn );
+#define kMaxN 20
+
+#ifdef HLTCA_GPUCODE
+                 const AliHLTTPCCARow &row = s.fRow;
+          const AliHLTTPCCARow &rowUp = s.fRowUp;
+          const AliHLTTPCCARow &rowDn = s.fRowDown;
+#else
+                 const AliHLTTPCCARow &row = tracker.Row( s.fIRow );
+                 const AliHLTTPCCARow &rowUp = tracker.Row( s.fIRowUp );
+          const AliHLTTPCCARow &rowDn = tracker.Row( s.fIRowDn );
+#endif
     const float y0 = row.Grid().YMin();
     const float z0 = row.Grid().ZMin();
     const float stepY = row.HstepY();
     const float stepZ = row.HstepZ();
 
-    for ( int ih = iThread; ih < s.fNHits; ih += nThreads ) {
+       for ( int ih = iThread; ih < s.fNHits; ih += nThreads ) {
 
       unsigned short *neighUp = s.fB[iThread];
       float2 *yzUp = s.fA[iThread];
+#if defined(HLTCA_GPUCODE) & kMaxN > ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP
+         unsigned short neighUp2[kMaxN - ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP];
+         float2 yzUp2[kMaxN - ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP];
+#endif
       //unsigned short neighUp[5];
       //float2 yzUp[5];
 
@@ -123,8 +163,14 @@ GPUd() void AliHLTTPCCANeighboursFinder::Thread
         int nNeighUp = 0;
 
         // coordinates of the hit in the current row
+#if defined(HLTCA_GPU_TEXTURE_FETCHa)
+               ushort2 tmpval = tex1Dfetch(gAliTexRefu2, ((char*) tracker.Data().HitData() - tracker.pData()->GPUTextureBase()) / sizeof(ushort2) + row.HitNumberOffset() + ih);
+        const float y = y0 + tmpval.x * stepY;
+        const float z = z0 + tmpval.y * stepZ;
+#else
         const float y = y0 + tracker.HitDataY( row, ih ) * stepY;
         const float z = z0 + tracker.HitDataZ( row, ih ) * stepZ;
+#endif
 
         AliHLTTPCCAHitArea areaDn, areaUp;
         // TODO: for NVIDIA GPUs it should use the GridContentUp/-Dn that got copied into shared mem
@@ -135,8 +181,18 @@ GPUd() void AliHLTTPCCANeighboursFinder::Thread
           AliHLTTPCCAHit h;
           int i = areaUp.GetNext( tracker, rowUp, tracker.Data(), &h );
           if ( i < 0 ) break;
-          neighUp[nNeighUp] = ( unsigned short ) i;
-          yzUp[nNeighUp] = CAMath::MakeFloat2( s.fDnDx * ( h.Y() - y ), s.fDnDx * ( h.Z() - z ) );
+#if defined(HLTCA_GPUCODE) & kMaxN > ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP
+                 if (nNeighUp >= ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP)
+                 {
+                       neighUp2[nNeighUp - ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP] = ( unsigned short ) i;
+                       yzUp2[nNeighUp - ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP] = CAMath::MakeFloat2( s.fDnDx * ( h.Y() - y ), s.fDnDx * ( h.Z() - z ) );
+                 }
+                 else
+#endif
+                 {
+                       neighUp[nNeighUp] = ( unsigned short ) i;
+                       yzUp[nNeighUp] = CAMath::MakeFloat2( s.fDnDx * ( h.Y() - y ), s.fDnDx * ( h.Z() - z ) );
+                 }
           if ( ++nNeighUp >= kMaxN ) break;
         } while ( 1 );
 
@@ -156,7 +212,12 @@ GPUd() void AliHLTTPCCANeighboursFinder::Thread
             float2 yzdn = CAMath::MakeFloat2( s.fUpDx * ( h.Y() - y ), s.fUpDx * ( h.Z() - z ) );
 
             for ( int iUp = 0; iUp < nNeighUp; iUp++ ) {
+#if defined(HLTCA_GPUCODE) & kMaxN > ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP
+                         float2 yzup = iUp >= ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP ? yzUp2[iUp - ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP] : yzUp[iUp];
+#else
               float2 yzup = yzUp[iUp];
+#endif
+
               float dy = yzdn.x - yzup.x;
               float dz = yzdn.y - yzup.y;
               float d = dy * dy + dz * dz;
@@ -169,7 +230,11 @@ GPUd() void AliHLTTPCCANeighboursFinder::Thread
           } while ( 1 );
 
           if ( bestD <= chi2Cut ) {
+#if defined(HLTCA_GPUCODE) & kMaxN > ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP
+                       linkUp = bestUp >= ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP ? neighUp2[bestUp - ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP] : neighUp[bestUp];
+#else
             linkUp = neighUp[bestUp];
+#endif
             linkDn = bestDn;
           }
         }
index 3dd21de..ff6dbee 100644 (file)
@@ -11,7 +11,7 @@
 
 
 #include "AliHLTTPCCADef.h"
-#include "AliHLTTPCCAGrid.h"
+#include "AliHLTTPCCARow.h"
 class AliHLTTPCCATracker;
 
 /**
@@ -27,18 +27,14 @@ class AliHLTTPCCANeighboursFinder
       public:
 #if !defined(HLTCA_GPUCODE)
         AliHLTTPCCASharedMemory()
-            : fGridUp(), fGridDn(), fNHits( 0 ), fUpNHits( 0 ), fDnNHits( 0 ), fUpDx( 0 ), fDnDx( 0 ), fUpTx( 0 ), fDnTx( 0 ), fIRow( 0 ), fIRowUp( 0 ), fIRowDn( 0 ), fNRows( 0 ) {}
+            : fNHits( 0 ), fUpNHits( 0 ), fDnNHits( 0 ), fUpDx( 0 ), fDnDx( 0 ), fUpTx( 0 ), fDnTx( 0 ), fIRow( 0 ), fIRowUp( 0 ), fIRowDn( 0 ), fNRows( 0 ), fRow(), fRowUp(), fRowDown() {}
 
         AliHLTTPCCASharedMemory( const AliHLTTPCCASharedMemory& /*dummy*/ )
-            : fGridUp(), fGridDn(), fNHits( 0 ), fUpNHits( 0 ), fDnNHits( 0 ), fUpDx( 0 ), fDnDx( 0 ), fUpTx( 0 ), fDnTx( 0 ), fIRow( 0 ), fIRowUp( 0 ), fIRowDn( 0 ), fNRows( 0 ) {}
+            : fNHits( 0 ), fUpNHits( 0 ), fDnNHits( 0 ), fUpDx( 0 ), fDnDx( 0 ), fUpTx( 0 ), fDnTx( 0 ), fIRow( 0 ), fIRowUp( 0 ), fIRowDn( 0 ), fNRows( 0 ), fRow(), fRowUp(), fRowDown() {}
         AliHLTTPCCASharedMemory& operator=( const AliHLTTPCCASharedMemory& /*dummy*/ ) { return *this; }
 #endif
 
-#ifndef CUDA_DEVICE_EMULATION
       protected:
-#endif
-        AliHLTTPCCAGrid fGridUp; // grid for the next row
-        AliHLTTPCCAGrid fGridDn; // grid for the previous row
         int fNHits; // n hits
         int fUpNHits; // n hits in the next row
         int fDnNHits; // n hits in the prev row
@@ -52,8 +48,7 @@ class AliHLTTPCCANeighboursFinder
         int fNRows; // number of rows
         float2 fA[256][ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP]; // temp memory
         unsigned short fB[256][ALIHLTTPCCANEIGHBOURS_FINDER_MAX_NNEIGHUP]; // temp memory
-        unsigned short fGridContentUp[ALIHLTTPCCANEIGHBOURS_FINDER_MAX_FGRIDCONTENTUPDOWN]; // grid content for the next row
-        unsigned short fGridContentDn[ALIHLTTPCCANEIGHBOURS_FINDER_MAX_FGRIDCONTENTUPDOWN];// grid content for the previous row
+               AliHLTTPCCARow fRow, fRowUp, fRowDown;
     };
 
     GPUd() static int NThreadSyncPoints() { return 2; }
index 0c9ec3d..ab58e61 100644 (file)
@@ -125,10 +125,7 @@ class AliHLTTPCCAParam
     GPUd() float GetBz( float x, float y, float z ) const;
     GPUd()  float GetBz( const AliHLTTPCCATrackParam &t ) const;
 
-#ifndef CUDA_DEVICE_EMULATION
   protected:
-#endif
-
     int fISlice; // slice number
     int fNRows; // number of rows
 
index fa16795..c8cbf4c 100644 (file)
@@ -365,7 +365,7 @@ void AliHLTTPCCAPerformance::ReadMCPoint( int TrackID, float X, float Y, float Z
   p.SetTime( Time );
   p.SetISlice( iSlice );
   float sx, sy, sz;
-  AliHLTTPCCAStandaloneFramework::Instance().SliceTracker( iSlice ).Param().Global2Slice( X, Y, Z, &sx, &sy, &sz );
+  AliHLTTPCCAStandaloneFramework::Instance().Param( iSlice ).Global2Slice( X, Y, Z, &sx, &sy, &sz );
   p.SetSx( sx );
   p.SetSy( sy );
   p.SetSz( sz );
@@ -1035,7 +1035,11 @@ void AliHLTTPCCAPerformance::SliceTrackCandPerformance( int /*iSlice*/, bool /*P
       int nla = 0;
 
       for ( int irow = firstRow; irow <= lastRow; irow++ ) {
-        int ih = t.RowHit( irow );
+#ifdef EXTERN_ROW_HITS
+        int ih = slice.TrackletRowHits[iRow * *slice.NTracklets() + itr];
+#else
+               int ih = t.RowHit( irow );
+#endif
         if ( ih < 0 ) continue;
         int index = firstSliceHit + slice.HitInputID( slice.Row( irow ), ih );
         AliHLTTPCCAHitLabel &l = fHitLabels[fTracker->Hits()[index].ID()];
@@ -1065,7 +1069,11 @@ void AliHLTTPCCAPerformance::SliceTrackCandPerformance( int /*iSlice*/, bool /*P
       }
       lmax = 0;
       for ( int irow = firstRow; irow <= lastRow; irow++ ) {
-        int ih = t.RowHit( irow );
+#ifdef EXTERN_ROW_HITS
+        int ih = slice.TrackletRowHits[iRow * *slice.NTracklets() + itr];
+#else
+               int ih = t.RowHit( irow );
+#endif
         if ( ih < 0 ) continue;
         int index = firstSliceHit + slice.HitInputID( slice.Row( irow ), ih );
         AliHLTTPCCAHitLabel &l = fHitLabels[fTracker->Hits()[index].ID()];
@@ -1178,15 +1186,16 @@ void AliHLTTPCCAPerformance::SlicePerformance( int iSlice, bool PrintFlag )
   int nRecTot = 0, nGhost = 0, nRecOut = 0;
   int nMCAll = 0, nRecAll = 0, nClonesAll = 0;
   int nMCRef = 0, nRecRef = 0, nClonesRef = 0;
-  const AliHLTTPCCATracker &tracker = hlt.SliceTracker( iSlice );
+  //const AliHLTTPCCATracker &tracker = hlt.SliceTracker( iSlice );
+  const AliHLTTPCCAClusterData &clusterdata = hlt.ClusterData(iSlice);
 
   // Select reconstructable MC tracks
 
   {
     for ( int imc = 0; imc < fNMCTracks; imc++ ) fMCTracks[imc].SetNHits( 0 );
 
-    for ( int ih = 0; ih < tracker.ClusterData()->NumberOfClusters(); ih++ ) {
-      int id = tracker.ClusterData()->Id( ih );
+    for ( int ih = 0; ih < clusterdata.NumberOfClusters(); ih++ ) {
+      int id = clusterdata.Id( ih );
       if ( id < 0 || id > fNHits ) break;
       AliHLTTPCCAHitLabel &l = fHitLabels[id];
       if ( l.fLab[0] >= 0 ) fMCTracks[l.fLab[0]].SetNHits( fMCTracks[l.fLab[0]].NHits() + 1 );
@@ -1210,9 +1219,9 @@ void AliHLTTPCCAPerformance::SlicePerformance( int iSlice, bool PrintFlag )
     }
   }
 
-  if ( !tracker.Output() ) return;
+  //if ( !tracker.Output() ) return;
 
-  const AliHLTTPCCASliceOutput &output = *tracker.Output();
+  const AliHLTTPCCASliceOutput &output = hlt.Output(iSlice);
 
   int traN = output.NTracks();
 
@@ -1618,7 +1627,8 @@ void AliHLTTPCCAPerformance::ClusterPerformance()
         t.SetSinPhi( dy * s );
         t.SetSignCosPhi( dx );
         t.SetDzDs( dz * s );
-        hlt.SliceTracker( 0 ).GetErrors2( data.RowNumber( ic ), t, errY, errZ );
+        //hlt.SliceTracker( 0 ).GetErrors2( data.RowNumber( ic ), t, errY, errZ );
+               hlt.Param(0).GetClusterErrors2( data.RowNumber( ic ), t.GetZ(), t.SinPhi(), t.GetCosPhi(), t.DzDs(), errY, errZ );
         errY = TMath::Sqrt( errY );
         errZ = TMath::Sqrt( errZ );
       }
@@ -1737,7 +1747,8 @@ void AliHLTTPCCAPerformance::SmearClustersMC()
         t.SetSinPhi( dy * s );
         t.SetSignCosPhi( dx );
         t.SetDzDs( dz * s );
-        hlt.SliceTracker( 0 ).GetErrors2( row0, t, errY, errZ );
+        //hlt.SliceTracker( 0 ).GetErrors2( row0, t, errY, errZ );
+               hlt.Param(0).GetClusterErrors2( row0, t.GetZ(), t.SinPhi(), t.GetCosPhi(), t.DzDs(), errY, errZ );
         errY = TMath::Sqrt( errY );
         errZ = TMath::Sqrt( errZ );
       }
index 3d71214..8b0c245 100644 (file)
@@ -23,30 +23,31 @@ class AliHLTTPCCATracker;
 #if defined(HLTCA_GPUCODE)
 
 template<class TProcess>
-GPUg() void AliHLTTPCCAProcess()
+GPUg() void AliHLTTPCCAProcess(int iSlice)
 {
-  AliHLTTPCCATracker &tracker = *( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker );
-
+  AliHLTTPCCATracker &tracker = ( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker )[iSlice];
   GPUshared() typename TProcess::AliHLTTPCCASharedMemory smem;
 
-  TProcess::Thread( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, 0, smem, tracker  );
-
-#define GPUPROCESS(iSync) \
-  if( TProcess::NThreadSyncPoints()>=iSync ){ \
-    GPUsync(); \
-    TProcess::Thread( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, iSync, smem, tracker  ); \
+  for( int iSync=0; iSync<=TProcess::NThreadSyncPoints(); iSync++){
+    __syncthreads();
+    TProcess::Thread( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, iSync, smem, tracker  );
   }
+}
 
-  GPUPROCESS( 1 )
-  GPUPROCESS( 2 )
-  GPUPROCESS( 3 )
-
-  //for( int iSync=0; iSync<=TProcess::NThreadSyncPoints(); iSync++){
-  //__syncthreads();
-  //TProcess::ThreadGPU( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, iSync, smem, tracker  );
-  //}
+template<class TProcess>
+GPUg() void AliHLTTPCCAProcessMulti(int firstSlice, int nSliceCount)
+{
+  const int iSlice = nSliceCount * (blockIdx.x + (gridDim.x % nSliceCount != 0 && nSliceCount * (blockIdx.x + 1) % gridDim.x != 0)) / gridDim.x;
+  const int nSliceBlockOffset = gridDim.x * iSlice / nSliceCount;
+  const int sliceBlockId = blockIdx.x - nSliceBlockOffset;
+  const int sliceGridDim = gridDim.x * (iSlice + 1) / nSliceCount - gridDim.x * (iSlice) / nSliceCount;
+  AliHLTTPCCATracker &tracker = ( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker )[firstSlice + iSlice];
+  GPUshared() typename TProcess::AliHLTTPCCASharedMemory smem;
 
-#undef GPUPROCESS
+  for( int iSync=0; iSync<=TProcess::NThreadSyncPoints(); iSync++){
+    __syncthreads();
+    TProcess::Thread( sliceGridDim, blockDim.x, sliceBlockId, threadIdx.x, iSync, smem, tracker  );
+  }
 }
 
 #else
index a809048..d04a579 100644 (file)
@@ -44,10 +44,7 @@ class AliHLTTPCCARow
     GPUhd() int   HitNumberOffset() const { return fHitNumberOffset; }
     GPUhd() unsigned int FirstHitInBinOffset() const { return fFirstHitInBinOffset; }
 
-#ifndef CUDA_DEVICE_EMULATION
   private:
-#endif
-
     int fNHits;            // number of hits
     float fX;              // X coordinate of the row
     float fMaxY;           // maximal Y coordinate of the row
index 3a50b99..5dd8e67 100644 (file)
@@ -21,6 +21,8 @@
 #include "AliHLTTPCCAHit.h"
 #include "AliHLTTPCCAParam.h"
 #include "MemoryAssignmentHelpers.h"
+#include "AliHLTTPCCAGPUConfig.h"
+#include "AliHLTTPCCAGPUTracker.h"
 #include <iostream>
 
 // calculates an approximation for 1/sqrt(x)
@@ -36,7 +38,7 @@ static inline float fastInvSqrt( float _x )
   return x.f;
 }
 
-inline void AliHLTTPCCASliceData::CreateGrid( AliHLTTPCCARow *row, const AliHLTTPCCAClusterData &data )
+inline void AliHLTTPCCASliceData::CreateGrid( AliHLTTPCCARow *row, const AliHLTTPCCAClusterData &data, int ClusterDataHitNumberOffset )
 {
   // grid creation
 
@@ -50,7 +52,7 @@ inline void AliHLTTPCCASliceData::CreateGrid( AliHLTTPCCARow *row, const AliHLTT
   float yMax = -1.e3f;
   float zMin =  1.e3f;
   float zMax = -1.e3f;
-  for ( int i = row->fHitNumberOffset; i < row->fHitNumberOffset + row->fNHits; ++i ) {
+  for ( int i = ClusterDataHitNumberOffset; i < ClusterDataHitNumberOffset + row->fNHits; ++i ) {
     const float y = data.Y( i );
     const float z = data.Z( i );
     if ( yMax < y ) yMax = y;
@@ -94,8 +96,8 @@ inline void AliHLTTPCCASliceData::PackHitData( AliHLTTPCCARow *row, const AliHLT
       std::cout << "!!!! hit packing error!!! " << xx << " " << yy << " " << std::endl;
     }
     // HitData is bin sorted
-    fHitDataY[row->fHitNumberOffset + hitIndex] = xx;
-    fHitDataZ[row->fHitNumberOffset + hitIndex] = yy;
+    fHitData[row->fHitNumberOffset + hitIndex].x = xx;
+    fHitData[row->fHitNumberOffset + hitIndex].y = yy;
   }
 }
 
@@ -107,49 +109,86 @@ void AliHLTTPCCASliceData::Clear()
 void AliHLTTPCCASliceData::InitializeRows( const AliHLTTPCCAParam &p )
 {
   // initialisation of rows
-
+       if (!fRows) fRows = new AliHLTTPCCARow[HLTCA_ROW_COUNT + 1];
   for ( int i = 0; i < p.NRows(); ++i ) {
     fRows[i].fX = p.RowX( i );
     fRows[i].fMaxY = CAMath::Tan( p.DAlpha() / 2. ) * fRows[i].fX;
   }
 }
 
-GPUh() char* AliHLTTPCCASliceData::SetGPUSliceDataMemory(char* pGPUMemory, const AliHLTTPCCAClusterData *data)
+#ifndef HLTCA_GPUCODE
+       AliHLTTPCCASliceData::~AliHLTTPCCASliceData()
+       {
+               //Standard Destrcutor
+               if (fRows)
+               {
+                       if (!fIsGpuSliceData) delete[] fRows;
+                       fRows = NULL;
+               }
+               if (fMemory)
+               {
+                       if (!fIsGpuSliceData) delete[] fMemory;
+                       fMemory = NULL;
+               }
+
+       }
+#endif
+
+GPUh() void AliHLTTPCCASliceData::SetGPUSliceDataMemory(void* const pSliceMemory, void* const pRowMemory)
 {
-       fMemory = (char*) pGPUMemory;
-       return(pGPUMemory + SetPointers(data, false));
+       //Set Pointer to slice data memory to external memory
+       fMemory = (char*) pSliceMemory;
+       fRows = (AliHLTTPCCARow*) pRowMemory;
 }
 
 size_t AliHLTTPCCASliceData::SetPointers(const AliHLTTPCCAClusterData *data, bool allocate)
 {
-  const int numberOfRows = data->LastRow() - data->FirstRow();
-  enum { kVectorAlignment = sizeof( int ) };
-  const int numberOfHitsPlusAlignment = NextMultipleOf < kVectorAlignment / sizeof( int ) > ( fNumberOfHits );
+       //Set slice data internal pointers
+  int hitMemCount = 0;
+  for ( int rowIndex = data->FirstRow(); rowIndex <= data->LastRow(); ++rowIndex )
+  {
+       hitMemCount += NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(data->NumberOfClusters( rowIndex ));
+  }
+       //Calculate Memory needed to store hits in rows
+
+  const int numberOfRows = data->LastRow() - data->FirstRow() + 1;
+  enum { kVectorAlignment = 256 /*sizeof( uint4 )*/ };
+  fNumberOfHitsPlusAlign = NextMultipleOf < (kVectorAlignment > sizeof(HLTCA_GPU_ROWALIGNMENT) ? kVectorAlignment : sizeof(HLTCA_GPU_ROWALIGNMENT)) / sizeof( int ) > ( hitMemCount );
+  fNumberOfHits = data->NumberOfClusters();
+  const int firstHitInBinSize = (23 + sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(int)) * numberOfRows + 4 * fNumberOfHits + 3;
+  //FIXME: sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(int) * numberOfRows is way to big and only to ensure to reserve enough memory for GPU Alignment.
+  //Might be replaced by correct value
+
   const int memorySize =
     // LinkData, HitData
-    numberOfHitsPlusAlignment * 4 * sizeof( short ) +
+    fNumberOfHitsPlusAlign * 4 * sizeof( short ) +
     // FirstHitInBin
-    NextMultipleOf<kVectorAlignment>( ( 23 * numberOfRows + 4 * fNumberOfHits + 3 ) * sizeof( int ) ) +
+    NextMultipleOf<kVectorAlignment>( ( firstHitInBinSize ) * sizeof( int ) ) +
     // HitWeights, ClusterDataIndex
-    numberOfHitsPlusAlignment * 2 * sizeof( int );
+    fNumberOfHitsPlusAlign * 2 * sizeof( int );
 
   if ( fMemorySize < memorySize ) {
        fMemorySize = memorySize;
-       if (allocate)
+       if (allocate && !fIsGpuSliceData)
        {
-         delete[] fMemory;
+               if (fMemory)
+               {
+                       delete[] fMemory;
+               }
          fMemory = new char[fMemorySize + 4];// kVectorAlignment];
        }
   }
 
   char *mem = fMemory;
-  AssignMemory( fLinkUpData,   mem, numberOfHitsPlusAlignment );
-  AssignMemory( fLinkDownData, mem, numberOfHitsPlusAlignment );
-  AssignMemory( fHitDataY,     mem, numberOfHitsPlusAlignment );
-  AssignMemory( fHitDataZ,     mem, numberOfHitsPlusAlignment );
-  AssignMemory( fFirstHitInBin,  mem, 23 * numberOfRows + 4 * fNumberOfHits + 3 );
-  AssignMemory( fHitWeights,   mem, numberOfHitsPlusAlignment );
-  AssignMemory( fClusterDataIndex, mem, numberOfHitsPlusAlignment );
+  AssignMemory( fLinkUpData,   mem, fNumberOfHitsPlusAlign );
+  AssignMemory( fLinkDownData, mem, fNumberOfHitsPlusAlign );
+  AssignMemory( fHitData,     mem, fNumberOfHitsPlusAlign );
+  AssignMemory( fFirstHitInBin,  mem, firstHitInBinSize );
+  fGpuMemorySize = mem - fMemory;
+
+  //Memory Allocated below will not be copied to GPU but instead be initialized on the gpu itself. Therefore it must not be copied to GPU!
+  AssignMemory( fHitWeights,   mem, fNumberOfHitsPlusAlign );
+  AssignMemory( fClusterDataIndex, mem, fNumberOfHitsPlusAlign );
   return(mem - fMemory);
 }
 
@@ -161,9 +200,7 @@ void AliHLTTPCCASliceData::InitFromClusterData( const AliHLTTPCCAClusterData &da
   // 1. prepare arrays
   ////////////////////////////////////
 
-#ifdef DEBUG
-  const int numberOfRows = data.LastRow() - data.FirstRow();
-#endif //DEBUG
+  const int numberOfRows = data.LastRow() - data.FirstRow() + 1;
   fNumberOfHits = data.NumberOfClusters();
 
   /* TODO Vectorization
@@ -193,7 +230,7 @@ void AliHLTTPCCASliceData::InitFromClusterData( const AliHLTTPCCAClusterData &da
     row.fHstepYi = 1.f;
     row.fHstepZi = 1.f;
   }
-  for ( int rowIndex = data.LastRow() + 1; rowIndex < 160; ++rowIndex ) {
+  for ( int rowIndex = data.LastRow() + 1; rowIndex < HLTCA_ROW_COUNT + 1; ++rowIndex ) {
     AliHLTTPCCARow &row = fRows[rowIndex];
     row.fGrid.CreateEmpty();
     row.fNHits = 0;
@@ -210,26 +247,31 @@ void AliHLTTPCCASliceData::InitFromClusterData( const AliHLTTPCCAClusterData &da
   }
 
 
-  AliHLTResizableArray<AliHLTTPCCAHit> binSortedHits( fNumberOfHits );
+  AliHLTResizableArray<AliHLTTPCCAHit> binSortedHits( fNumberOfHits + sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v) * numberOfRows + 1 );
 
   int gridContentOffset = 0;
+  int hitOffset = 0;
 
   int binCreationMemorySize = 103 * 2 + fNumberOfHits;
   AliHLTResizableArray<unsigned short> binCreationMemory( binCreationMemorySize );
 
+  fGPUSharedDataReq = 0;
+
   for ( int rowIndex = data.FirstRow(); rowIndex <= data.LastRow(); ++rowIndex ) {
     AliHLTTPCCARow &row = fRows[rowIndex];
     row.fNHits = data.NumberOfClusters( rowIndex );
     assert( row.fNHits < ( 1 << sizeof( unsigned short ) * 8 ) );
-    row.fHitNumberOffset = data.RowOffset( rowIndex );
+       row.fHitNumberOffset = hitOffset;
+       hitOffset += NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(data.NumberOfClusters( rowIndex ));
+
     row.fFirstHitInBinOffset = gridContentOffset;
 
-    CreateGrid( &row, data );
+    CreateGrid( &row, data, data.RowOffset( rowIndex ) );
     const AliHLTTPCCAGrid &grid = row.fGrid;
     const int numberOfBins = grid.N();
 
     int binCreationMemorySizeNew;
-    if ( ( binCreationMemorySizeNew = numberOfBins * 2 + 6 + row.fNHits ) > binCreationMemorySize ) {
+    if ( ( binCreationMemorySizeNew = numberOfBins * 2 + 6 + row.fNHits + sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(unsigned short) * numberOfRows + 1) > binCreationMemorySize ) {
       binCreationMemorySize = binCreationMemorySizeNew;
       binCreationMemory.Resize( binCreationMemorySize );
     }
@@ -243,8 +285,9 @@ void AliHLTTPCCASliceData::InitFromClusterData( const AliHLTTPCCAClusterData &da
     }
 
     for ( int hitIndex = 0; hitIndex < row.fNHits; ++hitIndex ) {
-      const int globalHitIndex = row.fHitNumberOffset + hitIndex;
+      const int globalHitIndex = data.RowOffset( rowIndex ) + hitIndex;
       const unsigned short bin = row.fGrid.GetBin( data.Y( globalHitIndex ), data.Z( globalHitIndex ) );
+
       bins[hitIndex] = bin;
       ++filled[bin];
     }
@@ -260,7 +303,7 @@ void AliHLTTPCCASliceData::InitFromClusterData( const AliHLTTPCCAClusterData &da
       --filled[bin];
       const unsigned short ind = c[bin] + filled[bin]; // generate an index for this hit that is >= c[bin] and < c[bin + 1]
       const int globalBinsortedIndex = row.fHitNumberOffset + ind;
-      const int globalHitIndex = row.fHitNumberOffset + hitIndex;
+      const int globalHitIndex = data.RowOffset( rowIndex ) + hitIndex;
 
       // allows to find the global hit index / coordinates from a global bin sorted hit index
       fClusterDataIndex[globalBinsortedIndex] = globalHitIndex;
@@ -277,12 +320,18 @@ void AliHLTTPCCASliceData::InitFromClusterData( const AliHLTTPCCAClusterData &da
     // grid.N is <= row.fNHits
     const int nn = numberOfBins + grid.Ny() + 3;
     for ( int i = numberOfBins; i < nn; ++i ) {
-      assert( row.fFirstHitInBinOffset + i < 23 * numberOfRows + 4 * fNumberOfHits + 3 );
+      assert( (signed) row.fFirstHitInBinOffset + i < 23 * numberOfRows + 4 * fNumberOfHits + 3 );
       fFirstHitInBin[row.fFirstHitInBinOffset + i] = a;
     }
 
     row.fFullSize = nn;
     gridContentOffset += nn;
+
+       if (NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(row.fNHits) + nn > (unsigned) fGPUSharedDataReq)
+               fGPUSharedDataReq = NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(row.fNHits) + nn;
+
+       //Make pointer aligned
+       gridContentOffset = NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(gridContentOffset);
   }
 
 #if 0
@@ -351,7 +400,7 @@ void AliHLTTPCCASliceData::ClearHitWeights()
     v0.store( mem );
   }
 #else
-  for ( int i = 0; i < fNumberOfHits; ++i ) {
+  for ( int i = 0; i < fNumberOfHitsPlusAlign; ++i ) {
     fHitWeights[i] = 0;
   }
 #endif
index ebdd8c6..c6fee6e 100644 (file)
@@ -21,6 +21,7 @@
 #include "AliHLTTPCCARow.h"
 #include "AliHLTTPCCAMath.h"
 #include "AliHLTArray.h"
+#include "AliHLTTPCCAGPUConfig.h"
 
 typedef int int_v;
 typedef unsigned int uint_v;
@@ -41,12 +42,18 @@ class AliHLTTPCCAParam;
  */
 class AliHLTTPCCASliceData
 {
-       friend class AliHLTTPCCAGPUTracker;
   public:
     AliHLTTPCCASliceData()
-        : fNumberOfHits( 0 ), fMemorySize( 0 ), fMemory( 0 ), fLinkUpData( 0 ),
-        fLinkDownData( 0 ), fHitDataY( 0 ), fHitDataZ( 0 ), fClusterDataIndex( 0 ),
-        fFirstHitInBin( 0 ), fHitWeights( 0 ) {}
+        : 
+               fIsGpuSliceData(0), fGPUSharedDataReq(0), fNumberOfHits( 0 ), fNumberOfHitsPlusAlign( 0 ), fMemorySize( 0 ), fGpuMemorySize( 0 ), fMemory( 0 ), fGPUTextureBase( 0 )
+               ,fRows( NULL ), fLinkUpData( 0 ), fLinkDownData( 0 ), fHitData( 0 ), fClusterDataIndex( 0 )
+        , fFirstHitInBin( 0 ), fHitWeights( 0 )
+       {
+       }
+
+#ifndef HLTCA_GPUCODE
+       ~AliHLTTPCCASliceData();
+#endif
 
     void InitializeRows( const AliHLTTPCCAParam &parameters );
 
@@ -55,7 +62,7 @@ class AliHLTTPCCASliceData
      * data.
      */
 
-       char* SetGPUSliceDataMemory(char* pGPUMemory, const AliHLTTPCCAClusterData *data);
+       void SetGPUSliceDataMemory(void* const pSliceMemory, void* const pRowMemory);
        size_t SetPointers(const AliHLTTPCCAClusterData *data, bool allocate = false);
     void InitFromClusterData( const AliHLTTPCCAClusterData &data );
 
@@ -68,6 +75,7 @@ class AliHLTTPCCASliceData
      * Return the number of hits in this slice.
      */
     GPUhd() int NumberOfHits() const { return fNumberOfHits; }
+       GPUhd() int NumberOfHitsPlusAlign() const { return fNumberOfHitsPlusAlign; }
 
     /**
      * Access to the hit links.
@@ -76,6 +84,13 @@ class AliHLTTPCCASliceData
      */
     short_v HitLinkUpData  ( const AliHLTTPCCARow &row, const short_v &hitIndex ) const;
     short_v HitLinkDownData( const AliHLTTPCCARow &row, const short_v &hitIndex ) const;
+    
+       GPUhd() const ushort2 *HitData( const AliHLTTPCCARow &row ) const;
+       GPUhd() const ushort2 *HitData() const { return(fHitData); }
+    GPUd() const short_v *HitLinkUpData  ( const AliHLTTPCCARow &row ) const;
+    GPUd() const short_v *HitLinkDownData( const AliHLTTPCCARow &row ) const;
+    GPUd() const ushort_v *FirstHitInBin( const AliHLTTPCCARow &row ) const;
+
     void SetHitLinkUpData  ( const AliHLTTPCCARow &row, const short_v &hitIndex,
                              const short_v &value );
     void SetHitLinkDownData( const AliHLTTPCCARow &row, const short_v &hitIndex,
@@ -90,8 +105,9 @@ class AliHLTTPCCASliceData
      * Return the y and z coordinate(s) of the given hit(s).
      */
     // TODO return float_v
-    short_v HitDataY( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const;
-    short_v HitDataZ( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const;
+    ushort_v HitDataY( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const;
+    ushort_v HitDataZ( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const;
+       ushort2 HitData( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const;
 
     /**
      * For a given bin index, content tells how many hits there are in the preceding bins. This maps
@@ -125,34 +141,53 @@ class AliHLTTPCCASliceData
      * Return the row object for the given row index.
      */
     const AliHLTTPCCARow &Row( int rowIndex ) const;
+       GPUhd() AliHLTTPCCARow* Rows() const {return fRows;}
 
-#ifndef CUDA_DEVICE_EMULATION
-  private:
-#endif
+       GPUh() char *Memory() const {return(fMemory); }
+       GPUh() size_t MemorySize() const {return(fMemorySize); }
+       GPUh() size_t GpuMemorySize() const {return(fGpuMemorySize); }
+       GPUh() int* HitWeights() const {return(fHitWeights); }
 
-    AliHLTTPCCASliceData( const AliHLTTPCCASliceData & )
-        : fNumberOfHits( 0 ), fMemorySize( 0 ), fMemory( 0 ), fLinkUpData( 0 ),
-        fLinkDownData( 0 ), fHitDataY( 0 ), fHitDataZ( 0 ), fClusterDataIndex( 0 ),
-        fFirstHitInBin( 0 ), fHitWeights( 0 ) {}
+       GPUhd() void SetGPUTextureBase(char* val) {fGPUTextureBase = val;}
+       GPUhd() char* GPUTextureBase() const { return(fGPUTextureBase); }
+       GPUhd() char* GPUTextureBaseConst() const { return(fGPUTextureBase); }
+
+       GPUh() int GPUSharedDataReq() const { return fGPUSharedDataReq; }
+
+       void SetGpuSliceData() { fIsGpuSliceData = 1; }
 
+  private:
+    AliHLTTPCCASliceData( const AliHLTTPCCASliceData & )
+        : 
+               fIsGpuSliceData(0), fGPUSharedDataReq(0), fNumberOfHits( 0 ), fNumberOfHitsPlusAlign( 0 ), fMemorySize( 0 ), fGpuMemorySize( 0 ), fMemory( 0 ), fGPUTextureBase( 0 )
+               ,fRows( NULL ), fLinkUpData( 0 ), fLinkDownData( 0 ), fHitData( 0 ), fClusterDataIndex( 0 )
+        , fFirstHitInBin( 0 ), fHitWeights( 0 )
+       {
+       }
     AliHLTTPCCASliceData& operator=( const AliHLTTPCCASliceData & ) {
       return *this;
     }
 
-    void CreateGrid( AliHLTTPCCARow *row, const AliHLTTPCCAClusterData &data );
+    void CreateGrid( AliHLTTPCCARow *row, const AliHLTTPCCAClusterData &data, int ClusterDataHitNumberOffset );
     void PackHitData( AliHLTTPCCARow *row, const AliHLTArray<AliHLTTPCCAHit, 1> &binSortedHits );
 
-    AliHLTTPCCARow fRows[200]; // The row objects needed for most accessor functions
+       int fIsGpuSliceData;            //Slice Data for GPU Tracker?
+       int fGPUSharedDataReq;          //Size of shared memory required for GPU Reconstruction
 
     int fNumberOfHits;         // the number of hits in this slice
+       int fNumberOfHitsPlusAlign;
+
     int fMemorySize;           // size of the allocated memory in bytes
+       int fGpuMemorySize;                // size of Memory needed to be transfered to GPU
     char *fMemory;             // pointer to the allocated memory where all the following arrays reside in
+       char *fGPUTextureBase;          // pointer to start of GPU texture
 
-    short *fLinkUpData;        // hit index in the row above which is linked to the given (global) hit index
+    AliHLTTPCCARow *fRows; // The row objects needed for most accessor functions
+
+       short *fLinkUpData;        // hit index in the row above which is linked to the given (global) hit index
     short *fLinkDownData;      // hit index in the row below which is linked to the given (global) hit index
 
-    unsigned short *fHitDataY;         // packed y coordinate of the given (global) hit index
-    unsigned short *fHitDataZ;         // packed z coordinate of the given (global) hit index
+    ushort2 *fHitData;         // packed y,z coordinate of the given (global) hit index
 
     int *fClusterDataIndex;    // see ClusterDataIndex()
 
@@ -176,6 +211,26 @@ GPUd() inline short_v AliHLTTPCCASliceData::HitLinkDownData( const AliHLTTPCCARo
   return fLinkDownData[row.fHitNumberOffset + hitIndex];
 }
 
+GPUd() inline const ushort_v *AliHLTTPCCASliceData::FirstHitInBin( const AliHLTTPCCARow &row ) const
+{
+  return &fFirstHitInBin[row.fFirstHitInBinOffset];
+}
+
+GPUd() inline const short_v *AliHLTTPCCASliceData::HitLinkUpData  ( const AliHLTTPCCARow &row ) const
+{
+  return &fLinkUpData[row.fHitNumberOffset];
+}
+
+GPUd() inline const short_v *AliHLTTPCCASliceData::HitLinkDownData( const AliHLTTPCCARow &row ) const
+{
+  return &fLinkDownData[row.fHitNumberOffset];
+}
+
+GPUhd() inline const ushort2 *AliHLTTPCCASliceData::HitData( const AliHLTTPCCARow &row ) const
+{
+  return &fHitData[row.fHitNumberOffset];
+}
+
 GPUd() inline void AliHLTTPCCASliceData::SetHitLinkUpData  ( const AliHLTTPCCARow &row, const short_v &hitIndex, const short_v &value )
 {
   fLinkUpData[row.fHitNumberOffset + hitIndex] = value;
@@ -186,14 +241,19 @@ GPUd() inline void AliHLTTPCCASliceData::SetHitLinkDownData( const AliHLTTPCCARo
   fLinkDownData[row.fHitNumberOffset + hitIndex] = value;
 }
 
-GPUd() inline short_v AliHLTTPCCASliceData::HitDataY( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const
+GPUd() inline ushort_v AliHLTTPCCASliceData::HitDataY( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const
+{
+  return fHitData[row.fHitNumberOffset + hitIndex].x;
+}
+
+GPUd() inline ushort_v AliHLTTPCCASliceData::HitDataZ( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const
 {
-  return fHitDataY[row.fHitNumberOffset + hitIndex];
+  return fHitData[row.fHitNumberOffset + hitIndex].y;
 }
 
-GPUd() inline short_v AliHLTTPCCASliceData::HitDataZ( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const
+GPUd() inline ushort2 AliHLTTPCCASliceData::HitData( const AliHLTTPCCARow &row, const uint_v &hitIndex ) const
 {
-  return fHitDataZ[row.fHitNumberOffset + hitIndex];
+  return fHitData[row.fHitNumberOffset + hitIndex];
 }
 
 GPUd() inline ushort_v AliHLTTPCCASliceData::FirstHitInBin( const AliHLTTPCCARow &row, ushort_v binIndexes ) const
index 842661d..59d7c69 100644 (file)
@@ -18,6 +18,8 @@
 //***************************************************************************
 
 #include "AliHLTTPCCASliceOutput.h"
+#include "MemoryAssignmentHelpers.h"
+
 
 GPUhd() int AliHLTTPCCASliceOutput::EstimateSize( int nOfTracks, int nOfTrackClusters )
 {
@@ -29,6 +31,18 @@ GPUhd() int AliHLTTPCCASliceOutput::EstimateSize( int nOfTracks, int nOfTrackClu
 }
 
 #ifndef HLTCA_GPUCODE
+
+void AliHLTTPCCASliceOutput::Clear()
+{
+       //Clear Slice Output and free Memory
+       if (fMemory) delete[] fMemory;
+       fMemory = NULL;
+       fNOutTracks = 0;
+       fNOutTrackHits = 0;
+       fNTracks = 0;
+       fNTrackClusters = 0;
+}
+
 template<typename T> inline void AssignNoAlignment( T *&dst, char *&mem, int count )
 {
   // assign memory to the pointer dst
@@ -36,11 +50,11 @@ template<typename T> inline void AssignNoAlignment( T *&dst, char *&mem, int cou
   mem = ( char * )( dst + count );
 }
 
-GPUhd() void AliHLTTPCCASliceOutput::SetPointers()
+void AliHLTTPCCASliceOutput::SetPointers()
 {
   // set all pointers
 
-  char *mem = &fMemory[0];
+  char *mem = fMemory;
   AssignNoAlignment( fTracks,            mem, fNTracks );
   AssignNoAlignment( fClusterUnpackedYZ, mem, fNTrackClusters );
   AssignNoAlignment( fClusterUnpackedX,  mem, fNTrackClusters );
@@ -49,5 +63,24 @@ GPUhd() void AliHLTTPCCASliceOutput::SetPointers()
   AssignNoAlignment( fClusterRow,        mem, fNTrackClusters );
   AssignNoAlignment( fClusterPackedAmp,  mem, fNTrackClusters );
 
+  // memory for output tracks
+
+  AssignMemory( fOutTracks, mem, fNTracks );
+
+  // arrays for track hits
+
+  AssignMemory( fOutTrackHits, mem, fNTrackClusters );
+
+
+  fMemorySize = (mem - fMemory);
+}
+
+void AliHLTTPCCASliceOutput::Allocate()
+{
+       //Allocate All memory needed for slice output
+  if (fMemory) delete[] fMemory;
+  SetPointers(); // to calculate the size
+  fMemory = reinterpret_cast<char*> ( new uint4 [ fMemorySize/sizeof( uint4 ) + 100] );
+  SetPointers(); // set pointers
 }
 #endif
index d1e83fc..f20503b 100644 (file)
 #define ALIHLTTPCCASLICEOUTPUT_H
 
 #include "AliHLTTPCCADef.h"
-
+#include <cstdlib>
 #include "AliHLTTPCCASliceTrack.h"
+//Obsolete
+#include "AliHLTTPCCAOutTrack.h"
 
 /**
  * @class AliHLTTPCCASliceOutput
@@ -29,6 +31,15 @@ class AliHLTTPCCASliceOutput
 {
   public:
 
+    AliHLTTPCCASliceOutput()
+        : fNTracks( 0 ), fNTrackClusters( 0 ), fTracks( 0 ),  fClusterId( 0 ), fClusterRow( 0 ), fClusterPackedYZ( 0 ), fClusterUnpackedYZ( 0 ), fClusterUnpackedX( 0 ), fClusterPackedAmp( 0 ),
+               fMemorySize( 0 ), fNOutTracks(0), fNOutTrackHits(0), fOutTracks(0), fOutTrackHits(0), fMemory(NULL) {}
+
+       ~AliHLTTPCCASliceOutput()
+       {
+               if (fMemory) delete[] fMemory;
+       }
+
     GPUhd() int NTracks()                    const { return fNTracks;              }
     GPUhd() int NTrackClusters()             const { return fNTrackClusters;       }
 
@@ -41,9 +52,8 @@ class AliHLTTPCCASliceOutput
     GPUhd() float    ClusterUnpackedX  ( int i )  const { return fClusterUnpackedX[i]; }
 
     GPUhd() static int EstimateSize( int nOfTracks, int nOfTrackClusters );
-#ifndef HLTCA_GPUCODE
-    GPUhd() void SetPointers();
-#endif
+    void SetPointers();
+       void Allocate();
 
     GPUhd() void SetNTracks       ( int v )  { fNTracks = v;        }
     GPUhd() void SetNTrackClusters( int v )  { fNTrackClusters = v; }
@@ -56,12 +66,26 @@ class AliHLTTPCCASliceOutput
     GPUhd() void SetClusterUnpackedYZ( int i, float2 v ) {  fClusterUnpackedYZ[i] = v; }
     GPUhd() void SetClusterUnpackedX( int i, float v ) {  fClusterUnpackedX[i] = v; }
 
-  private:
+       char* Memory() const { return(fMemory); }
+       size_t MemorySize() const { return(fMemorySize); }
+
+       void Clear();
+
+       //Obsolete Output
 
-    AliHLTTPCCASliceOutput( const AliHLTTPCCASliceOutput& )
-        : fNTracks( 0 ), fNTrackClusters( 0 ), fTracks( 0 ),  fClusterId( 0 ), fClusterRow( 0 ), fClusterPackedYZ( 0 ), fClusterUnpackedYZ( 0 ), fClusterUnpackedX( 0 ), fClusterPackedAmp( 0 ) {}
+    GPUhd()  int NOutTracks() const { return(fNOutTracks); }
+       GPUhd()  void SetNOutTracks(int val) { fNOutTracks = val; }
+    GPUhd()  AliHLTTPCCAOutTrack *OutTracks() const { return  fOutTracks; }
+    GPUhd()  const AliHLTTPCCAOutTrack &OutTrack( int index ) const { return fOutTracks[index]; }
+    GPUhd()  int NOutTrackHits() const { return  fNOutTrackHits; }
+       GPUhd()  void SetNOutTrackHits(int val) { fNOutTrackHits = val; }
+    GPUhd()  void SetOutTrackHit(int n, int val) { fOutTrackHits[n] = val; }
+    GPUhd()  int OutTrackHit( int i ) const { return  fOutTrackHits[i]; }
+
+  private:
 
     const AliHLTTPCCASliceOutput& operator=( const AliHLTTPCCASliceOutput& ) const { return *this; }
+    AliHLTTPCCASliceOutput( const AliHLTTPCCASliceOutput& );
 
     int fNTracks;                   // number of reconstructed tracks
     int fNTrackClusters;            // total number of track clusters
@@ -72,7 +96,18 @@ class AliHLTTPCCASliceOutput
     float2   *fClusterUnpackedYZ;    // pointer to cluster coordinates (temporary data, for debug proposes)
     float    *fClusterUnpackedX;     // pointer to cluster coordinates (temporary data, for debug proposes)
     UChar_t  *fClusterPackedAmp;     // pointer to packed cluster amplitudes
-    char fMemory[1]; // the memory where the pointers above point into
+       size_t fMemorySize;                             // Amount of memory really used
+
+    // obsolete output
+
+       int fNOutTracks;
+       int fNOutTrackHits;
+    AliHLTTPCCAOutTrack *fOutTracks; // output array of the reconstructed tracks
+    int *fOutTrackHits;  // output array of ID's of the reconstructed hits
+
+       //Must be last element of this class, user has to make sure to allocate anough memory consecutive to class memory!
+    char* fMemory; // the memory where the pointers above point into
+
 };
 
 #endif
index 22ba97b..1bc93dc 100644 (file)
 #include "AliHLTTPCCAClusterData.h"
 #include "TStopwatch.h"
 
-//If not building GPU Code then build dummy functions to link against
-#ifndef BUILD_GPU
-AliHLTTPCCAGPUTracker::AliHLTTPCCAGPUTracker() : gpuTracker(),GPUMemory(0), DebugLevel(0), OutFile(0), GPUMemSize(0) {}
-AliHLTTPCCAGPUTracker::~AliHLTTPCCAGPUTracker() {}
-int AliHLTTPCCAGPUTracker::InitGPU() {return(0);}
-//template <class T> inline T* AliHLTTPCCAGPUTracker::alignPointer(T* ptr, int alignment) {return(NULL);}
-//bool AliHLTTPCCAGPUTracker::CUDA_FAILED_MSG(cudaError_t error) {return(true);}
-//int AliHLTTPCCAGPUTracker::CUDASync() {return(1);}
-void AliHLTTPCCAGPUTracker::SetDebugLevel(int /*dwLevel*/, std::ostream * /*NewOutFile*/) {};
-int AliHLTTPCCAGPUTracker::Reconstruct(AliHLTTPCCATracker* /*tracker*/) {return(1);}
-int AliHLTTPCCAGPUTracker::ExitGPU() {return(0);}
+#ifdef HLTCA_STANDALONE
+#include <omp.h>
 #endif
 
 AliHLTTPCCAStandaloneFramework &AliHLTTPCCAStandaloneFramework::Instance()
@@ -47,7 +38,7 @@ AliHLTTPCCAStandaloneFramework &AliHLTTPCCAStandaloneFramework::Instance()
 }
 
 AliHLTTPCCAStandaloneFramework::AliHLTTPCCAStandaloneFramework()
-    :  fMerger(), fGPUTracker(), fStatNEvents( 0 ), fUseGPUTracker(false), fGPUDebugLevel(0)
+    : fMerger(), fTracker(), fStatNEvents( 0 ), fDebugLevel(0)
 {
   //* constructor
 
@@ -58,7 +49,7 @@ AliHLTTPCCAStandaloneFramework::AliHLTTPCCAStandaloneFramework()
 }
 
 AliHLTTPCCAStandaloneFramework::AliHLTTPCCAStandaloneFramework( const AliHLTTPCCAStandaloneFramework& )
-    :  fMerger(), fGPUTracker(), fStatNEvents( 0 ), fUseGPUTracker(false), fGPUDebugLevel(0)
+    : fMerger(), fTracker(), fStatNEvents( 0 ), fDebugLevel(0)
 {
   //* dummy
 }
@@ -114,9 +105,9 @@ void AliHLTTPCCAStandaloneFramework::FinishDataReading()
   WriteSettings(outfile);
   outfile.close();
 
-  event_number++;
+  event_number++;*/
   
-  std::ifstream infile(filename, std::ifstream::binary);
+  /*std::ifstream infile(filename, std::ifstream::binary);
   ReadEvent(infile);
   infile.close();*/
 
@@ -127,7 +118,7 @@ void AliHLTTPCCAStandaloneFramework::FinishDataReading()
 
 
 //int
-void AliHLTTPCCAStandaloneFramework::ProcessEvent()
+void AliHLTTPCCAStandaloneFramework::ProcessEvent(int forceSingleSlice)
 {
   // perform the event reconstruction
 
@@ -136,36 +127,38 @@ void AliHLTTPCCAStandaloneFramework::ProcessEvent()
   TStopwatch timer0;
   TStopwatch timer1;
 
-  if (!fUseGPUTracker || fGPUDebugLevel >= 3)
+#ifdef HLTCA_STANDALONE
+  unsigned long long int startTime, endTime, checkTime;
+  unsigned long long int cpuTimers[16], gpuTimers[16], tmpFreq;
+  StandaloneQueryFreq(&tmpFreq);
+  StandaloneQueryTime(&startTime);
+#endif
+
+  if (forceSingleSlice != -1)
   {
-       for ( int iSlice = 0; iSlice < fgkNSlices; iSlice++ ) {
-         fSliceTrackers[iSlice].ReadEvent( &( fClusterData[iSlice] ) );
-      fSliceTrackers[iSlice].Reconstruct();
-       }
-       if (fGPUDebugLevel >= 2) printf("\n");
+       if (fTracker.ProcessSlices(forceSingleSlice, 1, &fClusterData[forceSingleSlice], &fSliceOutput[forceSingleSlice])) return;
   }
-
-  if (fUseGPUTracker)
+  else
   {
-         for ( int iSlice = 0; iSlice < fgkNSlices; iSlice++ ) {
-           fSliceTrackers[iSlice].ReadEvent( &( fClusterData[iSlice] ) );
-               if (fGPUTracker.Reconstruct(&fSliceTrackers[iSlice]))
-               {
-                       printf("Error during GPU Reconstruction!!!\n");
-                       //return(1);
-               }
-         }
-         if (fGPUDebugLevel >= 2) printf("\n");
+       for (int iSlice = 0;iSlice < fgkNSlices;iSlice += fTracker.MaxSliceCount())
+       {
+               if (fTracker.ProcessSlices(iSlice, fTracker.MaxSliceCount(), &fClusterData[iSlice], &fSliceOutput[iSlice])) return;
+       }
   }
 
+#ifdef HLTCA_STANDALONE
+  StandaloneQueryTime(&endTime);
+  StandaloneQueryTime(&checkTime);
+#endif
+
   timer1.Stop();
   TStopwatch timer2;
 
   fMerger.Clear();
-  fMerger.SetSliceParam( fSliceTrackers[0].Param() );
+  fMerger.SetSliceParam( fTracker.Param(0) );
 
   for ( int i = 0; i < fgkNSlices; i++ ) {
-    fMerger.SetSliceData( i, fSliceTrackers[i].Output() );
+    fMerger.SetSliceData( i, &fSliceOutput[i] );
   }
 
   fMerger.Reconstruct();
@@ -177,6 +170,47 @@ void AliHLTTPCCAStandaloneFramework::ProcessEvent()
   fLastTime[1] = timer1.CpuTime();
   fLastTime[2] = timer2.CpuTime();
 
+#ifdef HLTCA_STANDALONE
+  printf("Tracking Time: %lld us\nTime uncertainty: %lld ns\n", (endTime - startTime) * 1000000 / tmpFreq, (checkTime - endTime) * 1000000000 / tmpFreq);
+
+  if (fDebugLevel >= 1)
+  {
+               const char* tmpNames[16] = {"Initialisation", "Neighbours Finder", "Neighbours Cleaner", "Starts Hits Finder", "Start Hits Sorter", "Weight Cleaner", "Reserved", "Tracklet Constructor", "Tracklet Selector", "Write Output", "Unused", "Unused", "Unused", "Unused", "Unused", "Unused"};
+
+               for (int i = 0;i < 10;i++)
+               {
+                       if (i == 6) continue;
+                       cpuTimers[i] = gpuTimers[i] = 0;
+                       for ( int iSlice = 0; iSlice < fgkNSlices;iSlice++)
+                       {
+                               if (forceSingleSlice != -1) iSlice = forceSingleSlice;
+                               cpuTimers[i] += *fTracker.PerfTimer(0, iSlice, i + 1) - *fTracker.PerfTimer(0, iSlice, i);
+                               if (forceSingleSlice != -1 || (fTracker.MaxSliceCount() && (iSlice % fTracker.MaxSliceCount() == 0 || i <= 5)))
+                                       gpuTimers[i] += *fTracker.PerfTimer(1, iSlice, i + 1) - *fTracker.PerfTimer(1, iSlice, i);
+                               if (forceSingleSlice != -1) break;
+                       }
+                       if (forceSingleSlice == -1)
+                       {
+                               cpuTimers[i] /= fgkNSlices;
+                               gpuTimers[i] /= fgkNSlices;
+                       }
+                       cpuTimers[i] *= 1000000;
+                       gpuTimers[i] *= 1000000;
+                       cpuTimers[i] /= tmpFreq;
+                       gpuTimers[i] /= tmpFreq;
+                       cpuTimers[i] /= omp_get_max_threads();
+
+                       printf("Execution Time: Task: %20s ", tmpNames[i]);
+                               printf("CPU: %15lld\t\t", cpuTimers[i]);
+                               printf("GPU: %15lld\t\t", gpuTimers[i]);
+                       if (fDebugLevel >=6 && gpuTimers[i])
+                               printf("Speedup: %4lld%%", cpuTimers[i] * 100 / gpuTimers[i]);
+                       printf("\n");
+               }
+               printf("Execution Time: Task: %20s CPU: %15lld\n", "Merger", (long long int) (timer2.CpuTime() * 1000000));
+  }
+#endif
+
   for ( int i = 0; i < 3; i++ ) fStatTime[i] += fLastTime[i];
 
   //return(0);
@@ -188,7 +222,7 @@ void AliHLTTPCCAStandaloneFramework::WriteSettings( std::ostream &out ) const
   //* write settings to the file
   out << NSlices() << std::endl;
   for ( int iSlice = 0; iSlice < NSlices(); iSlice++ ) {
-    fSliceTrackers[iSlice].Param().WriteSettings( out );
+    fTracker.Param(iSlice).WriteSettings( out );
   }
 }
 
@@ -200,7 +234,7 @@ void AliHLTTPCCAStandaloneFramework::ReadSettings( std::istream &in )
   for ( int iSlice = 0; iSlice < nSlices; iSlice++ ) {
     AliHLTTPCCAParam param;
     param.ReadSettings ( in );
-    fSliceTrackers[iSlice].Initialize( param );
+       fTracker.InitializeSliceParam(iSlice, param);
   }
 }
 
@@ -237,28 +271,4 @@ void AliHLTTPCCAStandaloneFramework::ReadTracks( std::istream &in )
     fStatTime[i] += fLastTime[i];
   }
   //fMerger.Output()->Read( in );
-}
-
-int AliHLTTPCCAStandaloneFramework::InitGPU()
-{
-       if (fUseGPUTracker) return(1);
-       int retVal = fGPUTracker.InitGPU();
-       fUseGPUTracker = retVal == 0;
-       return(retVal);
-}
-
-int AliHLTTPCCAStandaloneFramework::ExitGPU()
-{
-       if (!fUseGPUTracker) return(1);
-       return(fGPUTracker.ExitGPU());
-}
-
-void AliHLTTPCCAStandaloneFramework::SetGPUDebugLevel(int Level, std::ostream *OutFile, std::ostream *GPUOutFile)
-{
-       fGPUTracker.SetDebugLevel(Level, GPUOutFile);
-       fGPUDebugLevel = Level;
-       for (int i = 0;i < fgkNSlices;i++)
-       {
-               fSliceTrackers[i].SetGPUDebugLevel(Level, OutFile);
-       }
-}
+}
\ No newline at end of file
index 175a41f..c9b5edc 100644 (file)
 #define ALIHLTTPCCASTANDALONEFRAMEWORK_H
 
 #include "AliHLTTPCCADef.h"
-#include "AliHLTTPCCATracker.h"
 #include "AliHLTTPCCAMerger.h"
 #include "AliHLTTPCCAClusterData.h"
-#include "AliHLTTPCCAGPUTracker.h"
+#include "AliHLTTPCCATrackerFramework.h"
 #include <iostream>
 #include <fstream>
+#include "TStopwatch.h"
 
 /**
  * @class AliHLTTPCCAStandaloneFramework
@@ -35,7 +35,9 @@ class AliHLTTPCCAStandaloneFramework
 
     static AliHLTTPCCAStandaloneFramework &Instance();
 
-    AliHLTTPCCATracker &SliceTracker( int iSlice )  { return fSliceTrackers[iSlice]; }
+       const AliHLTTPCCAParam &Param ( int iSlice ) const { return(fTracker.Param(iSlice)); }
+       const AliHLTTPCCARow &Row ( int iSlice, int iRow ) const { return(fTracker.Row(iSlice, iRow)); }
+    const AliHLTTPCCASliceOutput &Output( int iSlice ) const { return fSliceOutput[iSlice]; }
     AliHLTTPCCAMerger  &Merger()  { return fMerger; }
     AliHLTTPCCAClusterData &ClusterData( int iSlice ) { return fClusterData[iSlice]; }
 
@@ -59,7 +61,7 @@ class AliHLTTPCCAStandaloneFramework
     /**
      *  perform event reconstruction
      */
-    void ProcessEvent();
+    void ProcessEvent(int forceSingleSlice = -1);
 
 
     int NSlices() const { return fgkNSlices; }
@@ -76,9 +78,19 @@ class AliHLTTPCCAStandaloneFramework
     void ReadEvent( std::istream &in );
     void ReadTracks( std::istream &in );
 
-       int InitGPU();
-       int ExitGPU();
-       void SetGPUDebugLevel(int Level, std::ostream *OutFile = NULL, std::ostream *GPUOutFile = NULL);
+       int InitGPU(int sliceCount = 1, int forceDeviceID = -1) { return(fTracker.InitGPU(sliceCount, forceDeviceID)); }
+       int ExitGPU() { return(fTracker.ExitGPU()); }
+       void SetGPUDebugLevel(int Level, std::ostream *OutFile = NULL, std::ostream *GPUOutFile = NULL) { fDebugLevel = Level; fTracker.SetGPUDebugLevel(Level, OutFile, GPUOutFile); }
+       int SetGPUTrackerOption(char* OptionName, int OptionValue) {return(fTracker.SetGPUTrackerOption(OptionName, OptionValue));}
+       int SetGPUTracker(bool enable) { return(fTracker.SetGPUTracker(enable)); }
+       int GetGPUStatus() { return(fTracker.GetGPUStatus()); }
+
+       int InitializeSliceParam(int iSlice, AliHLTTPCCAParam& param) { return(fTracker.InitializeSliceParam(iSlice, param)); }
+
+#ifdef HLTCA_STANDALONE
+       static inline void StandaloneQueryTime(unsigned long long int *i);
+       static inline void StandaloneQueryFreq(unsigned long long int *i);
+#endif
 
   private:
 
@@ -87,18 +99,39 @@ class AliHLTTPCCAStandaloneFramework
     AliHLTTPCCAStandaloneFramework( const AliHLTTPCCAStandaloneFramework& );
     const AliHLTTPCCAStandaloneFramework &operator=( const AliHLTTPCCAStandaloneFramework& ) const;
 
-    AliHLTTPCCATracker fSliceTrackers[fgkNSlices]; //* array of slice trackers
     AliHLTTPCCAMerger fMerger;  //* global merger
     AliHLTTPCCAClusterData fClusterData[fgkNSlices];
+       AliHLTTPCCASliceOutput fSliceOutput[fgkNSlices];
 
-  AliHLTTPCCAGPUTracker fGPUTracker;
+       AliHLTTPCCATrackerFramework fTracker;
 
     double fLastTime[20]; //* timers
     double fStatTime[20]; //* timers
     int fStatNEvents;    //* n events proceed
 
-  bool fUseGPUTracker; // use the GPU tracker 
-  int fGPUDebugLevel;  // debug level for the GPU code
+       int fDebugLevel;
 };
 
+#ifdef HLTCA_STANDALONE
+       void AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(unsigned long long int *i)
+       {
+       #ifdef R__WIN32
+                 QueryPerformanceCounter((LARGE_INTEGER*) i);
+       #else
+                 timespec t;
+                 clock_gettime(CLOCK_REALTIME, &t);
+                 *i = (unsigned long long int) t.tv_sec * (unsigned long long int) 1000000000 + (unsigned long long int) t.tv_nsec;
+       #endif
+       }
+
+       void AliHLTTPCCAStandaloneFramework::StandaloneQueryFreq(unsigned long long int *i)
+       {
+       #ifdef R__WIN32
+                 QueryPerformanceFrequency((LARGE_INTEGER*) i);
+       #else
+               *i = 1000000000;
+       #endif
+       }
+#endif
+
 #endif
index 85134eb..cbf350d 100644 (file)
@@ -29,31 +29,46 @@ GPUd() void AliHLTTPCCAStartHitsFinder::Thread
 
   if ( iSync == 0 ) {
     if ( iThread == 0 ) {
-      if ( iBlock == 0 ) {
-        CAMath::AtomicExch( tracker.NTracklets(), 0 );
-      }
       s.fNRows = tracker.Param().NRows();
       s.fIRow = iBlock + 1;
       s.fNRowStartHits = 0;
       if ( s.fIRow <= s.fNRows - 4 ) {
         s.fNHits = tracker.Row( s.fIRow ).NHits();
-        if ( s.fNHits >= 10240 ) s.fNHits = 10230;
+        if ( s.fNHits >= ALIHLTTPCCASTARTHITSFINDER_MAX_FROWSTARTHITS ) s.fNHits = ALIHLTTPCCASTARTHITSFINDER_MAX_FROWSTARTHITS - 1;
       } else s.fNHits = -1;
     }
   } else if ( iSync == 1 ) {
     const AliHLTTPCCARow &row = tracker.Row( s.fIRow );
+       const AliHLTTPCCARow &rowUp = tracker.Row( s.fIRow + 2 );
     for ( int ih = iThread; ih < s.fNHits; ih += nThreads ) {
-      if ( ( tracker.HitLinkDownData( row, ih ) < 0 ) && ( tracker.HitLinkUpData( row, ih ) >= 0 ) ) {
+      if (tracker.HitLinkDownData(row, ih) < 0 && tracker.HitLinkUpData(row, ih) >= 0 && tracker.HitLinkUpData(rowUp, tracker.HitLinkUpData(row, ih)) >= 0) {
         int oldNRowStartHits = CAMath::AtomicAdd( &s.fNRowStartHits, 1 );
         s.fRowStartHits[oldNRowStartHits].Set( s.fIRow, ih );
       }
     }
   } else if ( iSync == 2 ) {
     if ( iThread == 0 ) {
-      s.fNOldStartHits = CAMath::AtomicAdd( tracker.NTracklets(), s.fNRowStartHits );
+         int nOffset = CAMath::AtomicAdd( tracker.NTracklets(), s.fNRowStartHits );
+#ifdef HLTCA_GPUCODE
+         if (nOffset + s.fNRowStartHits >= HLTCA_GPU_MAX_TRACKLETS)\r
+         {\r
+               tracker.GPUParameters()->fGPUError = HLTCA_GPU_ERROR_TRACKLET_OVERFLOW;\r
+               CAMath::AtomicExch( tracker.NTracklets(), 0 );\r
+               nOffset = 0;\r
+         }\r
+#endif
+      s.fNOldStartHits = nOffset;
+#ifdef HLTCA_GPU_SORT_STARTHITS
+         tracker.RowStartHitCountOffset()[s.fIRow].x = s.fNRowStartHits;
+         tracker.RowStartHitCountOffset()[s.fIRow].y = nOffset;
+#endif
     }
   } else if ( iSync == 3 ) {
+#ifdef HLTCA_GPU_SORT_STARTHITS
+       AliHLTTPCCAHitId *const startHits = tracker.TrackletTmpStartHits();
+#else
     AliHLTTPCCAHitId *const startHits = tracker.TrackletStartHits();
+#endif
     for ( int ish = iThread; ish < s.fNRowStartHits; ish += nThreads ) {
       startHits[s.fNOldStartHits+ish] = s.fRowStartHits[ish];
     }
index 709278d..1facf3c 100644 (file)
@@ -34,10 +34,7 @@ class AliHLTTPCCAStartHitsFinder
         AliHLTTPCCASharedMemory& operator=( const AliHLTTPCCASharedMemory& /*dummy*/ ) { return *this; }
 #endif
 
-#ifndef CUDA_DEVICE_EMULATION
       protected:
-#endif
-
         int fIRow; // row index
         int fNRows; // n rows
         int fNHits; // n hits in the row
diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsSorter.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsSorter.cxx
new file mode 100644 (file)
index 0000000..c8a3498
--- /dev/null
@@ -0,0 +1,111 @@
+// @(#) $Id: AliHLTTPCCAStartHitsFinder.cxx 27042 2008-07-02 12:06:02Z richterm $
+// **************************************************************************
+// This file is property of and copyright by the ALICE HLT Project          *
+// ALICE Experiment at CERN, All rights reserved.                           *
+//                                                                          *
+// Primary Authors: David Rohr <drohr@kip.uni-heidelberg.de>                           *
+//                  for The ALICE HLT Project.                              *
+//                                                                          *
+// Permission to use, copy, modify and distribute this software and its     *
+// documentation strictly for non-commercial purposes is hereby granted     *
+// without fee, provided that the above copyright notice appears in all     *
+// copies and that both the copyright notice and this permission notice     *
+// appear in the supporting documentation. The authors make no claims       *
+// about the suitability of this software for any purpose. It is            *
+// provided "as is" without express or implied warranty.                    *
+//                                                                          *
+//***************************************************************************
+
+#include "AliHLTTPCCAStartHitsSorter.h"
+#include "AliHLTTPCCATracker.h"
+
+GPUd() void AliHLTTPCCAStartHitsSorter::Thread
+( int nBlocks, int nThreads, int iBlock, int iThread, int iSync,
+  AliHLTTPCCASharedMemory &s, AliHLTTPCCATracker &tracker )
+{
+       //Sorts the Start Hits by Row Index and create RowBlock Data
+  if ( iSync == 0 ) {
+    if ( iThread == 0 ) {
+               const int gpuFixedBlockCount = tracker.GPUParametersConst()->fGPUFixedBlockCount;
+         const int tmpNRows = tracker.Param().NRows() - 6;
+         int nRows = iBlock == 29 ? (tmpNRows - (tmpNRows / 30) * 29) : (tmpNRows / 30);
+         int nStartRow = (tmpNRows / 30) * iBlock + 1;
+      int startOffset = 0;
+         int startOffset2 = 0;
+         int previousBlockEndTracklet = 0;
+         int nCurrentBlock = 0;
+
+      for (int ir = 1;ir < tracker.Param().NRows() - 5;ir++)
+         {
+           if (ir < nStartRow)
+                       startOffset2 += tracker.RowStartHitCountOffset()[ir].x;
+
+               if (iBlock == nBlocks - 1 && nCurrentBlock < gpuFixedBlockCount)
+               {
+                       startOffset += tracker.RowStartHitCountOffset()[ir].x;
+                       for (int i = previousBlockEndTracklet + HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS;i <= startOffset;i += HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS)
+                       {
+                               if (previousBlockEndTracklet / (HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS) != i / (HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS))
+                               {
+                                       tracker.BlockStartingTracklet()[nCurrentBlock].x = previousBlockEndTracklet;
+                                       tracker.BlockStartingTracklet()[nCurrentBlock++].y = HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS;
+                                       previousBlockEndTracklet += HLTCA_GPU_THREAD_COUNT - HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS;
+                                       if (nCurrentBlock == gpuFixedBlockCount)
+                                       {
+                                               break;
+                                       }
+                               }
+                       }
+                       if ((ir + 1) % HLTCA_GPU_SCHED_ROW_STEP == 0 && nCurrentBlock < gpuFixedBlockCount)
+                       {
+                               if (previousBlockEndTracklet != startOffset)
+                               {
+                                       tracker.BlockStartingTracklet()[nCurrentBlock].x = previousBlockEndTracklet;
+                                       tracker.BlockStartingTracklet()[nCurrentBlock++].y = startOffset - previousBlockEndTracklet;
+                                       previousBlockEndTracklet = startOffset;
+                               }
+                       }
+                       if (nCurrentBlock == gpuFixedBlockCount)
+                       {
+                               tracker.GPUParameters()->fScheduleFirstDynamicTracklet = previousBlockEndTracklet;
+                       }
+               }
+         }
+         if (iBlock == nBlocks - 1)
+         {
+           if (nCurrentBlock < gpuFixedBlockCount)
+               {
+                       tracker.BlockStartingTracklet()[nCurrentBlock].x = previousBlockEndTracklet;
+                       tracker.BlockStartingTracklet()[nCurrentBlock++].y = startOffset - previousBlockEndTracklet;
+                       tracker.GPUParameters()->fScheduleFirstDynamicTracklet = startOffset;
+               }
+               for (int i = nCurrentBlock;i < HLTCA_GPU_BLOCK_COUNT;i++)
+               {
+                       tracker.BlockStartingTracklet()[i].x = 0;
+                       tracker.BlockStartingTracklet()[i].y = 0;                       
+               }
+         }
+         s.fStartOffset = startOffset2;
+         s.fNRows = nRows;
+      s.fStartRow = nStartRow;
+    }
+  } else if ( iSync == 1 ) {
+       int startOffset = s.fStartOffset;
+    for (int ir = 0;ir < s.fNRows;ir++)
+       {
+               AliHLTTPCCAHitId *const startHits = tracker.TrackletStartHits();
+               AliHLTTPCCAHitId *const tmpStartHits = tracker.TrackletTmpStartHits();
+               const int tmpLen = tracker.RowStartHitCountOffset()[ir + s.fStartRow].x;                        //Length of hits in row stored by StartHitsFinder
+               const int tmpOffset = tracker.RowStartHitCountOffset()[ir + s.fStartRow].y;                     //Offset of first hit in row of unsorted array by StartHitsFinder
+               if (iThread == 0)
+                       tracker.RowStartHitCountOffset()[ir + s.fStartRow].z = startOffset;                             //Store New Offset Value of sorted array
+
+               for (int j = iThread;j < tmpLen;j += nThreads)
+               {
+                       startHits[startOffset + j] = tmpStartHits[tmpOffset + j];
+               }
+               startOffset += tmpLen;
+    }
+  }
+}
+
diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsSorter.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCAStartHitsSorter.h
new file mode 100644 (file)
index 0000000..0969c26
--- /dev/null
@@ -0,0 +1,50 @@
+//-*- Mode: C++ -*-
+// ************************************************************************
+// This file is property of and copyright by the ALICE HLT Project        *
+// ALICE Experiment at CERN, All rights reserved.                         *
+// See cxx source for full Copyright notice                               *
+//                                                                        *
+//*************************************************************************
+
+#ifndef ALIHLTTPCCASTARTHITSSORTER_H
+#define ALIHLTTPCCASTARTHITSSORTER_H
+
+#include "AliHLTTPCCADef.h"
+#include "AliHLTTPCCAHitId.h"
+
+class AliHLTTPCCATracker;
+
+/**
+ * @class AliHLTTPCCAStartHitsSorter
+ *
+ */
+class AliHLTTPCCAStartHitsSorter
+{
+  public:
+    class AliHLTTPCCASharedMemory
+    {
+        friend class AliHLTTPCCAStartHitsSorter;
+      public:
+#if !defined(HLTCA_GPUCODE)
+        AliHLTTPCCASharedMemory()
+            : fStartRow( 0 ), fNRows( 0 ), fStartOffset( 0 ) {}
+
+        AliHLTTPCCASharedMemory( const AliHLTTPCCASharedMemory& /*dummy*/ )
+            : fStartRow( 0 ), fNRows( 0 ), fStartOffset( 0 ) {}
+        AliHLTTPCCASharedMemory& operator=( const AliHLTTPCCASharedMemory& /*dummy*/ ) { return *this; }
+#endif
+
+      protected:
+        int fStartRow;         // start row index
+        int fNRows;                    // number of rows to process
+               int fStartOffset;       //start offset for hits sorted by this block
+    };
+
+    GPUd() static int NThreadSyncPoints() { return 1; }
+
+    GPUd() static void Thread( int nBlocks, int nThreads, int iBlock, int iThread, int iSync,
+                               AliHLTTPCCASharedMemory &smem, AliHLTTPCCATracker &tracker );
+};
+
+
+#endif
index 714e974..d81fcc4 100644 (file)
@@ -38,10 +38,7 @@ class AliHLTTPCCATrack
     GPUhd() void SetFirstHitID( int v )          { fFirstHitID = v; }
     GPUhd() void SetParam( AliHLTTPCCATrackParam v ) { fParam = v; };
 
-#ifndef CUDA_DEVICE_EMULATION
   private:
-#endif
-
     bool fAlive;       // flag for mark tracks used by the track merger
     int  fFirstHitID; // index of the first track cell in the track->cell pointer array
     int  fNHits;      // number of track cells
index 1656f8c..365cf2c 100644 (file)
@@ -57,10 +57,7 @@ class AliHLTTPCCATrackLinearisation
     GPUd() void SetDzDs( float v )  {  fDzDs   = v; }
     GPUd() void SetQPt( float v )   {  fQPt = v; }
 
-#ifndef CUDA_DEVICE_EMULATION
   private:
-#endif
-
     float fSinPhi; // SinPhi
     float fCosPhi; // CosPhi
     float fDzDs;   // DzDs
index 74f5bbe..4e84d3f 100644 (file)
@@ -71,6 +71,7 @@ class AliHLTTPCCATrackParam
 
     const float *GetPar() const { return fP; }
     const float *GetCov() const { return fC; }
+       GPUd() float GetCov(int i) {return fC[i]; }
 
     GPUhd() void SetPar( int i, float v ) { fP[i] = v; }
     GPUhd() void SetCov( int i, float v ) { fC[i] = v; }
@@ -136,10 +137,10 @@ class AliHLTTPCCATrackParam
 
     GPUd() void Print() const;
 
-#ifndef CUDA_DEVICE_EMULATION
   private:
-#endif
-
+       //WARNING, Track Param Data is copied in the GPU Tracklet Constructor element by element instead of using copy constructor!!!
+       //This is neccessary for performance reasons!!!
+       //Changes to Elements of this class therefore must also be applied to TrackletConstructor!!!
     float fX;      // x position
     float fSignCosPhi; // sign of cosPhi
     float fP[5];   // 'active' track parameters: Y, Z, SinPhi, DzDs, q/Pt
index b745fe9..a2e442c 100644 (file)
 
 #include "AliHLTTPCCATrackParam.h"
 
+#include "AliHLTTPCCAGPUConfig.h"
+#include "AliHLTTPCCAGPUTracker.h"
+
 #if !defined(HLTCA_GPUCODE)
 #include <iostream>
+#include <string.h>
 #endif
 
 //#define DRAW1
 #include "AliHLTTPCCAPerformance.h"
 #endif
 
+#ifdef HLTCA_STANDALONE
+#include "AliHLTTPCCAStandaloneFramework.h"
+#endif
 
 ClassImp( AliHLTTPCCATracker )
 
-GPUd() AliHLTTPCCATracker::~AliHLTTPCCATracker()
+
+#if !defined(HLTCA_GPUCODE)
+
+AliHLTTPCCATracker::~AliHLTTPCCATracker()
 {
   // destructor
        if (!fIsGPUTracker)
        {
-               delete[] fCommonMemory;
-               delete[] fHitMemory;
-               delete[] fTrackMemory;
+               if (fCommonMem) delete fCommonMem;
+               if (fHitMemory) delete[] fHitMemory;
+               if (fTrackletMemory) delete[] fTrackletMemory;
+               if (fTrackMemory) delete[] fTrackMemory;
+               fCommonMem = NULL;
+               fHitMemory = fTrackMemory = NULL;
        }
 }
 
-#if !defined(HLTCA_GPUCODE)
-
-
 // ----------------------------------------------------------------------------------
 void AliHLTTPCCATracker::Initialize( const AliHLTTPCCAParam &param )
 {
@@ -94,148 +104,236 @@ void AliHLTTPCCATracker::StartEvent()
 void AliHLTTPCCATracker::SetGPUTracker()
 {
        fIsGPUTracker = true;
+       fData.SetGpuSliceData();
 }
 
 char* AliHLTTPCCATracker::SetGPUTrackerCommonMemory(char* pGPUMemory)
 {
-       fCommonMemory = (char*) pGPUMemory;
-       SetPointersCommon();
-       return(pGPUMemory + fCommonMemorySize);
+       fCommonMem = (commonMemoryStruct*) pGPUMemory;
+       return(pGPUMemory + sizeof(commonMemoryStruct));
 }
 
 
-char* AliHLTTPCCATracker::SetGPUTrackerHitsMemory(char* pGPUMemory, int MaxNHits )
+char* AliHLTTPCCATracker::SetGPUTrackerHitsMemory(char* pGPUMemory, int MaxNHits)
 {
        fHitMemory = (char*) pGPUMemory;
        SetPointersHits(MaxNHits);
-       return(pGPUMemory + fHitMemorySize);
+       pGPUMemory += fHitMemorySize;
+       AssignMemory(fTrackletTmpStartHits, pGPUMemory, NHitsTotal());
+       AssignMemory(fRowStartHitCountOffset, pGPUMemory, Param().NRows());
+
+       return(pGPUMemory);
 }
 
+char* AliHLTTPCCATracker::SetGPUTrackerTrackletsMemory(char* pGPUMemory, int MaxNTracks)
+{
+       fTrackletMemory = (char*) pGPUMemory;
+       SetPointersTracklets(MaxNTracks);
+       pGPUMemory += fTrackletMemorySize;
+       AssignMemory(fGPUTrackletTemp, pGPUMemory, MaxNTracks);
+       AssignMemory(fRowBlockTracklets, pGPUMemory, MaxNTracks * 2 * (Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1));
+       AssignMemory(fRowBlockPos, pGPUMemory, 2 * (Param().NRows() / HLTCA_GPU_SCHED_ROW_STEP + 1));
+       AssignMemory(fBlockStartingTracklet, pGPUMemory, HLTCA_GPU_BLOCK_COUNT);
+
+       return(pGPUMemory);
+}
 
 char* AliHLTTPCCATracker::SetGPUTrackerTracksMemory(char* pGPUMemory, int MaxNTracks, int MaxNHits )
 {
        fTrackMemory = (char*) pGPUMemory;
        SetPointersTracks(MaxNTracks, MaxNHits);
-       return(pGPUMemory + fTrackMemorySize);
+       pGPUMemory += fTrackMemorySize;
+
+       return(pGPUMemory);
 }
 
-void AliHLTTPCCATracker::DumpLinks(std::ostream &out)
+void AliHLTTPCCATracker::DumpSliceData(std::ostream &out)
 {
+       out << "Slice Data:" << endl;
        for (int i = 0;i < Param().NRows();i++)
        {
                out << "Row: " << i << endl;
                for (int j = 0;j < Row(i).NHits();j++)
                {
-                       out << HitLinkUpData(Row(i), j) << ", ";
+                       if (j && j % 16 == 0) out << std::endl;
+                       out << j << '-' << Data().HitDataY(Row(i), j) << '-' << Data().HitDataZ(Row(i), j) << ", ";
                }
                out << endl;
        }
 }
 
-void AliHLTTPCCATracker::DumpStartHits(std::ostream &out)
+void AliHLTTPCCATracker::DumpLinks(std::ostream &out)
 {
-       for (int j = 0;j < Param().NRows();j++)
+       out << "Hit Links:" << endl;
+       for (int i = 0;i < Param().NRows();i++)
        {
-               for (int i = 0;i < *NTracklets();i++)
+               out << "Row: " << i << endl;
+               for (int j = 0;j < Row(i).NHits();j++)
                {
-                       if (TrackletStartHit(i).RowIndex() == j)
-                               out << TrackletStartHit(i).RowIndex() << "-" << TrackletStartHit(i).HitIndex() << endl;
+                       if (j && j % 32 == 0) out << endl;
+                       out << HitLinkUpData(Row(i), j) << "/" << HitLinkDownData(Row(i), j) << ", ";
                }
+               out << endl;
        }
-       out << endl;
 }
 
-void AliHLTTPCCATracker::DumpTrackHits(std::ostream &out)
+void AliHLTTPCCATracker::DumpHitWeights(std::ostream &out)
 {
-       for (int k = 0;k < Param().NRows();k++)
+    out << "Hit Weights:" << endl;
+    for (int i = 0;i < Param().NRows();i++)
+    {
+       out << "Row: " << i << ":" << endl;
+       for (int j = 0;j < Row(i).NHits();j++)
        {
-               for (int j = 0;j < *NTracks();j++)
-               {
-                       if (Tracks()[j].NHits() == 0 || !Tracks()[j].Alive()) continue;
-                       if (TrackHits()[Tracks()[j].FirstHitID()].RowIndex() == k)
-                       {
-                               for (int i = 0;i < Tracks()[j].NHits();i++)
-                               {
-                                       out << TrackHits()[Tracks()[j].FirstHitID() + i].RowIndex() << "-" << TrackHits()[Tracks()[j].FirstHitID() + i].HitIndex() << ", ";
-                               }
-                               out << "(Track: " << j << ")" << endl;
-                       }
-               }
+           if (j && j % 32 == 0) out << endl;
+           out << HitWeight(Row(i), j) << ", ";
        }
+       out << endl;
+    }
+}
+
+int AliHLTTPCCATracker::starthitSortComparison(const void*a, const void* b)
+{
+       AliHLTTPCCAHitId* aa = (AliHLTTPCCAHitId*) a;
+       AliHLTTPCCAHitId* bb = (AliHLTTPCCAHitId*) b;
+
+       if (aa->RowIndex() != bb->RowIndex()) return(aa->RowIndex() - bb->RowIndex());
+       return(aa->HitIndex() - bb->HitIndex());
 }
 
-int trackletSortComparison(const void* a, const void* b)
+void AliHLTTPCCATracker::DumpStartHits(std::ostream &out)
 {
-       const AliHLTTPCCATracklet* aa = (AliHLTTPCCATracklet*) a;
-       const AliHLTTPCCATracklet* bb = (AliHLTTPCCATracklet*) b;
-       if (aa->NHits() == 0) return(-1);
-       if (bb->NHits() == 0) return(1);
-       if (aa->FirstRow() != bb->FirstRow())
+       out << "Start Hits: (" << *NTracklets() << ")" << endl;
+#ifdef HLTCA_GPU_SORT_DUMPDATA
+       qsort(TrackletStartHits(), *NTracklets(), sizeof(AliHLTTPCCAHitId), starthitSortComparison);
+#endif
+       for (int i = 0;i < *NTracklets();i++)
        {
-               return(aa->FirstRow() - bb->FirstRow());
+               out << TrackletStartHit(i).RowIndex() << "-" << TrackletStartHit(i).HitIndex() << endl;
        }
-       for (int i = aa->FirstRow();i <= aa->LastRow();i++)
+       out << endl;
+}
+
+void AliHLTTPCCATracker::DumpTrackHits(std::ostream &out)
+{
+       out << "Tracks: (" << *NTracks() << ")" << endl;
+#ifdef HLTCA_GPU_SORT_DUMPDATA
+       for (int k = 0;k < Param().NRows();k++)
        {
-               if (i >= bb->LastRow()) return(-1);
-               if (aa->RowHit(i) != bb->RowHit(i))
+               for (int l = 0;l < Row(k).NHits();l++)
                {
-                       return(aa->RowHit(i) - bb->RowHit(i));
-               }
+#endif
+                       for (int j = 0;j < *NTracks();j++)
+                       {
+                               if (Tracks()[j].NHits() == 0 || !Tracks()[j].Alive()) continue;
+#ifdef HLTCA_GPU_SORT_DUMPDATA
+                               if (TrackHits()[Tracks()[j].FirstHitID()].RowIndex() == k && TrackHits()[Tracks()[j].FirstHitID()].HitIndex() == l)
+                               {
+#endif
+                                       for (int i = 0;i < Tracks()[j].NHits();i++)
+                                       {
+                                               out << TrackHits()[Tracks()[j].FirstHitID() + i].RowIndex() << "-" << TrackHits()[Tracks()[j].FirstHitID() + i].HitIndex() << ", ";
+                                       }
+                                       out << "(Track: " << j << ")" << endl;
+#ifdef HLTCA_GPU_SORT_DUMPDATA
+                               }
+                       }       
+#endif
+               }       
+#ifdef HLTCA_GPU_SORT_DUMPDATA
        }
-       return(0);
+#endif
 }
 
 void AliHLTTPCCATracker::DumpTrackletHits(std::ostream &out)
 {
-       qsort(Tracklets(), *NTracklets(), sizeof(AliHLTTPCCATracklet), trackletSortComparison);
-       for (int k = 0;k < Param().NRows();k++)
+       out << "Tracklets: (" << *NTracklets() << ")" << endl;
+#ifdef HLTCA_GPU_SORT_DUMPDATA
+       AliHLTTPCCAHitId* tmpIds = new AliHLTTPCCAHitId[*NTracklets()];
+       AliHLTTPCCATracklet* tmpTracklets = new AliHLTTPCCATracklet[*NTracklets()];
+       memcpy(tmpIds, TrackletStartHits(), *NTracklets() * sizeof(AliHLTTPCCAHitId));
+       memcpy(tmpTracklets, Tracklets(), *NTracklets() * sizeof(AliHLTTPCCATracklet));
+#ifdef EXTERN_ROW_HITS
+       int* tmpHits = new int[*NTracklets() * Param().NRows()];
+       memcpy(tmpHits, TrackletRowHits(), *NTracklets() * Param().NRows() * sizeof(int));
+#endif
+       qsort(TrackletStartHits(), *NTracklets(), sizeof(AliHLTTPCCAHitId), starthitSortComparison);
+       for (int i = 0;i < *NTracklets();i++)
        {
                for (int j = 0;j < *NTracklets();j++)
                {
-                       if (Tracklets()[j].NHits() == 0) continue;
-                       if (Tracklets()[j].LastRow() > Tracklets()[j].FirstRow() && (Tracklets()[j].FirstRow() >= Param().NRows() || Tracklets()[j].LastRow() >= Param().NRows()))
-                       {
-                               printf("\nError: First %d Last %d Num %d", Tracklets()[j].FirstRow(), Tracklets()[j].LastRow(), Tracklets()[j].NHits());
-                       }
-                       else if (Tracklets()[j].NHits() && Tracklets()[j].FirstRow() == k && Tracklets()[j].LastRow() > Tracklets()[j].FirstRow())
+                       if (tmpIds[i].RowIndex() == TrackletStartHit(j).RowIndex() && tmpIds[i].HitIndex() == TrackletStartHit(j).HitIndex())
                        {
-                               for (int i = Tracklets()[j].FirstRow();i <= Tracklets()[j].LastRow();i++)
+                               memcpy(&Tracklets()[j], &tmpTracklets[i], sizeof(AliHLTTPCCATracklet));
+#ifdef EXTERN_ROW_HITS
+                               if (tmpTracklets[i].NHits())
                                {
-                                       if (Tracklets()[j].RowHit(i) != -1)
-                                               out << i << "-" << Tracklets()[j].RowHit(i) << ", ";
+                                       for (int k = tmpTracklets[i].FirstRow();k <= tmpTracklets[i].LastRow();k++)
+                                       {
+                                               fTrackletRowHits[k * *NTracklets() + j] = tmpHits[k * *NTracklets() + i];
+                                       }
                                }
-                               out << endl;
+#endif
+                               break;
+                       }
+               }
+       }
+       delete[] tmpIds;
+       delete[] tmpTracklets;
+#ifdef EXTERN_ROW_HITS
+       delete[] tmpHits;
+#endif
+#endif
+       for (int j = 0;j < *NTracklets();j++)
+       {
+               out << "Tracklet " << j << " (Hits: " << setw(3) << Tracklets()[j].NHits() << ", Start: " << setw(3) << TrackletStartHit(j).RowIndex() << "-" << setw(3) << TrackletStartHit(j).HitIndex() << ") ";
+               if (Tracklets()[j].NHits() == 0);
+               else if (Tracklets()[j].LastRow() > Tracklets()[j].FirstRow() && (Tracklets()[j].FirstRow() >= Param().NRows() || Tracklets()[j].LastRow() >= Param().NRows()))
+               {
+#ifdef HLTCA_STANDALONE
+                       printf("\nError: First %d Last %d Num %d", Tracklets()[j].FirstRow(), Tracklets()[j].LastRow(), Tracklets()[j].NHits());
+#endif
+               }
+               else if (Tracklets()[j].NHits() && Tracklets()[j].LastRow() > Tracklets()[j].FirstRow())
+               {
+                       for (int i = Tracklets()[j].FirstRow();i <= Tracklets()[j].LastRow();i++)
+                       {
+                               //if (Tracklets()[j].RowHit(i) != -1)
+#ifdef EXTERN_ROW_HITS
+                                       out << i << "-" << fTrackletRowHits[i * fCommonMem->fNTracklets + j] << ", ";
+#else
+                                       out << i << "-" << Tracklets()[j].RowHit(i) << ", ";
+#endif
                        }
                }
+               out << endl;
        }
 }
 
 
-void  AliHLTTPCCATracker::SetupCommonMemory()
+void AliHLTTPCCATracker::SetupCommonMemory()
 {
   // set up common memory
 
   if (!fIsGPUTracker)
   {
-    if ( !fCommonMemory ) {
-      SetPointersCommon(); // just to calculate the size
+    if ( !fCommonMem ) {
       // the 1600 extra bytes are not used unless fCommonMemorySize increases with a later event
-      fCommonMemory = reinterpret_cast<char*> ( new uint4 [ fCommonMemorySize/sizeof( uint4 ) + 100] );
-      SetPointersCommon();// set pointers
+      //fCommonMemory = reinterpret_cast<char*> ( new uint4 [ fCommonMemorySize/sizeof( uint4 ) + 100] );
+         fCommonMem = new commonMemoryStruct;
     }
 
-    delete[] fHitMemory;
-    delete[] fTrackMemory;
-    fHitMemory = 0;
-    fTrackMemory = 0;
+       if (fHitMemory) delete[] fHitMemory;
+       if (fTrackletMemory) delete[] fTrackletMemory;
+       if (fTrackMemory) delete[] fTrackMemory;
   }
 
+  fHitMemory = fTrackletMemory = fTrackMemory = 0;
+
   fData.Clear();
-  *fNTracklets = 0;
-  *fNTracks = 0 ;
-  *fNTrackHits = 0;
-  *fNOutTracks = 0;
-  *fNOutTrackHits = 0;
+  fCommonMem->fNTracklets = 0;
+  fCommonMem->fNTracks = 0 ;
+  fCommonMem->fNTrackHits = 0;
 }
 
 void AliHLTTPCCATracker::ReadEvent( AliHLTTPCCAClusterData *clusterData )
@@ -248,36 +346,16 @@ void AliHLTTPCCATracker::ReadEvent( AliHLTTPCCAClusterData *clusterData )
 
   //* Convert input hits, create grids, etc.
   fData.InitFromClusterData( *clusterData );
-
   {
-    SetPointersHits( fData.NumberOfHits() ); // to calculate the size
-    fHitMemory = reinterpret_cast<char*> ( new uint4 [ fHitMemorySize/sizeof( uint4 ) + 100] );
+    if (!fIsGPUTracker)
+       {
+               SetPointersHits( fData.NumberOfHits() ); // to calculate the size
+               fHitMemory = reinterpret_cast<char*> ( new uint4 [ fHitMemorySize/sizeof( uint4 ) + 100] );
+       }
     SetPointersHits( fData.NumberOfHits() ); // set pointers for hits
-    *fNTracklets = 0;
-    *fNTracks = 0 ;
-    *fNOutTracks = 0;
-    *fNOutTrackHits = 0;
   }
 }
 
-
-GPUhd() void  AliHLTTPCCATracker::SetPointersCommon()
-{
-  // set all pointers to the event memory
-
-  char *mem = fCommonMemory;
-  AssignMemory( fNTracklets, mem, 1 );
-  AssignMemory( fNTracks, mem, 1 );
-  AssignMemory( fNTrackHits, mem, 1 );
-  AssignMemory( fNOutTracks, mem, 1 );
-  AssignMemory( fNOutTrackHits, mem, 1 );
-
-  // calculate the size
-
-  fCommonMemorySize = mem - fCommonMemory;
-}
-
-
 GPUhd() void  AliHLTTPCCATracker::SetPointersHits( int MaxNHits )
 {
   // set all pointers to the event memory
@@ -286,43 +364,42 @@ GPUhd() void  AliHLTTPCCATracker::SetPointersHits( int MaxNHits )
 
   // extra arrays for tpc clusters
 
-  AssignMemory( fTrackletStartHits, mem, MaxNHits );
+#ifdef HLTCA_GPU_SORT_STARTHITS_2
+  AssignMemory( fTrackletStartHits, mem, MaxNHits + 32);
+#else
+  AssignMemory( fTrackletStartHits, mem, MaxNHits);
+#endif
+
+  // calculate the size
 
-  // arrays for track hits
+  fHitMemorySize = mem - fHitMemory;
+}
 
-  AssignMemory( fTrackHits, mem, 10 * MaxNHits );
+GPUhd() void  AliHLTTPCCATracker::SetPointersTracklets( int MaxNTracklets )
+{
+  // set all pointers to the tracklets memory
+  char *mem = fTrackletMemory;
 
-  AssignMemory( fOutTrackHits, mem, 10 * MaxNHits );
+  // memory for tracklets
 
-  // calculate the size
+  AssignMemory( fTracklets, mem, MaxNTracklets );
+#ifdef EXTERN_ROW_HITS
+  AssignMemory( fTrackletRowHits, mem, MaxNTracklets * Param().NRows());
+#endif
 
-  fHitMemorySize = mem - fHitMemory;
+  fTrackletMemorySize = mem - fTrackletMemory;
 }
 
 
 GPUhd() void  AliHLTTPCCATracker::SetPointersTracks( int MaxNTracks, int MaxNHits )
 {
   // set all pointers to the tracks memory
-
   char *mem = fTrackMemory;
 
-  // memory for tracklets
-
-  AssignMemory( fTracklets, mem, MaxNTracks );
-
   // memory for selected tracks
 
   AssignMemory( fTracks, mem, MaxNTracks );
-
-  // memory for output
-
-  AlignTo < sizeof( void * ) > ( mem );
-  fOutput = reinterpret_cast<AliHLTTPCCASliceOutput *>( mem );
-  mem += AliHLTTPCCASliceOutput::EstimateSize( MaxNTracks, MaxNHits );
-
-  // memory for output tracks
-
-  AssignMemory( fOutTracks, mem, MaxNTracks );
+  AssignMemory( fTrackHits, mem, 2 * MaxNHits );
 
   // calculate the size
 
@@ -362,14 +439,26 @@ void AliHLTTPCCATracker::RunStartHitsFinder()
 
 void AliHLTTPCCATracker::RunTrackletConstructor()
 {
-  AliHLTTPCCAProcess1<AliHLTTPCCATrackletConstructor>( 1, TRACKLET_CONSTRUCTOR_NMEMTHREDS + *fNTracklets, *this );
+    AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorNewCPU(*this);
 }
 
 void AliHLTTPCCATracker::RunTrackletSelector()
 {
-  AliHLTTPCCAProcess<AliHLTTPCCATrackletSelector>( 1, *fNTracklets, *this );
+  AliHLTTPCCAProcess<AliHLTTPCCATrackletSelector>( 1, fCommonMem->fNTracklets, *this );
 }
 
+#ifdef HLTCA_STANDALONE
+void AliHLTTPCCATracker::StandalonePerfTime(int i)
+{
+  if (fGPUDebugLevel >= 1)
+  {
+         AliHLTTPCCAStandaloneFramework::StandaloneQueryTime(&fPerfTimers[i]);
+  }
+}
+#else
+void AliHLTTPCCATracker::StandalonePerfTime(int /*i*/) {}
+#endif
+
 GPUh() void AliHLTTPCCATracker::Reconstruct()
 {
   //* reconstruction of event
@@ -387,6 +476,11 @@ GPUh() void AliHLTTPCCATracker::Reconstruct()
   //if( fParam.ISlice()<1 ) return; //SG!!!
 
   TStopwatch timer0;
+
+  StandalonePerfTime(0);
+
+  fOutput->Clear();
+
   if (CheckEmptySlice()) return;
 
 #ifdef DRAW1
@@ -403,23 +497,25 @@ GPUh() void AliHLTTPCCATracker::Reconstruct()
   }
 #endif
 
-  *fNTracks = 0;
-  *fNTracklets = 0;
+       fCommonMem->fNTracklets = fCommonMem->fNTracks = fCommonMem->fNTrackHits = 0;
 
 #if !defined(HLTCA_GPUCODE)
 
-  if (fGPUDebugLevel >= 3)
+  /*if (fGPUDebugLevel >= 6)
   {
          *fGPUDebugOut << endl << endl << "Slice: " << Param().ISlice() << endl;
-  }
+         *fGPUDebugOut << "Slice Data:" << endl;
+         DumpSliceData(*fGPUDebugOut);
+  }*/
+
+  StandalonePerfTime(1);
 
   RunNeighboursFinder();
 
-  if (fGPUDebugLevel >= 3)
-  {
-         *fGPUDebugOut << "Neighbours Finder:" << endl;
-         DumpLinks(*fGPUDebugOut);
-  }
+  StandalonePerfTime(2);
+
+  if (fGPUDebugLevel >= 6) DumpLinks(*fGPUDebugOut);
+  
 #ifdef HLTCA_INTERNAL_PERFORMANCE
   //if( Param().ISlice()<=2 )
   //AliHLTTPCCAPerformance::Instance().LinkPerformance( Param().ISlice() );
@@ -435,65 +531,66 @@ GPUh() void AliHLTTPCCATracker::Reconstruct()
 
   RunNeighboursCleaner();
 
-  if (fGPUDebugLevel >= 3)
-  {
-         *fGPUDebugOut << "Neighbours Cleaner:" << endl;
-         DumpLinks(*fGPUDebugOut);
-  }
+  StandalonePerfTime(3);
+
+  if (fGPUDebugLevel >= 6) DumpLinks(*fGPUDebugOut);
 
   RunStartHitsFinder();
 
-  if (fGPUDebugLevel >= 3)
-  {
-         *fGPUDebugOut << "Start Hits: (" << *fNTracklets << ")" << endl;
-         DumpStartHits(*fGPUDebugOut);
-  }
-  
-  if (fGPUDebugLevel >= 2) printf("%3d ", *fNTracklets);
+  StandalonePerfTime(4);
+  StandalonePerfTime(5);
 
+  if (fGPUDebugLevel >= 6) DumpStartHits(*fGPUDebugOut);
+  
   fData.ClearHitWeights();
 
-  SetPointersTracks( *fNTracklets * 2, NHitsTotal() ); // to calculate the size
+  SetPointersTracklets( fCommonMem->fNTracklets * 2 ); // to calculate the size
+  fTrackletMemory = reinterpret_cast<char*> ( new uint4 [ fTrackletMemorySize/sizeof( uint4 ) + 100] );
+  SetPointersTracklets( fCommonMem->fNTracklets * 2 ); // set pointers for hits
+
+  SetPointersTracks( fCommonMem->fNTracklets * 2, NHitsTotal() ); // to calculate the size
   fTrackMemory = reinterpret_cast<char*> ( new uint4 [ fTrackMemorySize/sizeof( uint4 ) + 100] );
-  SetPointersTracks( *fNTracklets * 2, NHitsTotal() ); // set pointers for hits
+  SetPointersTracks( fCommonMem->fNTracklets * 2, NHitsTotal() ); // set pointers for hits
+
+  StandalonePerfTime(6);
+  StandalonePerfTime(7);
 
   RunTrackletConstructor();
 
-  if (fGPUDebugLevel >= 3)
-  {
-         *fGPUDebugOut << "Tracklet Hits:" << endl;
-         DumpTrackletHits(*fGPUDebugOut);
-  }
+  StandalonePerfTime(8);
+
+  if (fGPUDebugLevel >= 6) DumpTrackletHits(*fGPUDebugOut);
+  if (fGPUDebugLevel >= 6) DumpHitWeights(*fGPUDebugOut);
 
   //std::cout<<"Slice "<<Param().ISlice()<<": NHits="<<NHitsTotal()<<", NTracklets="<<*NTracklets()<<std::endl;
 
   RunTrackletSelector();
 
+  StandalonePerfTime(9);
+
   //std::cout<<"Slice "<<Param().ISlice()<<": N start hits/tracklets/tracks = "<<nStartHits<<" "<<nStartHits<<" "<<*fNTracks<<std::endl;
 
-  if (fGPUDebugLevel >= 3)
-  {
-         *fGPUDebugOut << "Track Hits: (" << *NTracks() << ")" << endl;
-         DumpTrackHits(*fGPUDebugOut);
-  }
+  if (fGPUDebugLevel >= 6) DumpTrackHits(*fGPUDebugOut);
 
   //std::cout<<"Memory used for slice "<<fParam.ISlice()<<" : "<<fCommonMemorySize/1024./1024.<<" + "<<fHitMemorySize/1024./1024.<<" + "<<fTrackMemorySize/1024./1024.<<" = "<<( fCommonMemorySize+fHitMemorySize+fTrackMemorySize )/1024./1024.<<" Mb "<<std::endl;
 
   WriteOutput();
 
+  StandalonePerfTime(10);
+
 #endif
 
 #ifdef DRAW1
   {
     AliHLTTPCCADisplay &disp = AliHLTTPCCADisplay::Instance();
     AliHLTTPCCATracker &slice = *this;
-    std::cout << "N out tracks = " << *slice.NOutTracks() << std::endl;
+    std::cout << "N out tracks = " << slice.NOutTracks() << std::endl;
     AliHLTTPCCADisplay::Instance().SetSliceView();
     AliHLTTPCCADisplay::Instance().SetCurrentSlice( this );
     AliHLTTPCCADisplay::Instance().DrawSlice( this, 1 );
     disp.DrawSliceHits( kRed, .5 );
     disp.Ask();
-    for ( int itr = 0; itr < *slice.NOutTracks(); itr++ ) {
+    for ( int itr = 0; itr < slice.NOutTracks(); itr++ ) {
       std::cout << "track N " << itr << ", nhits=" << slice.OutTracks()[itr].NHits() << std::endl;
       disp.DrawSliceOutTrack( itr, kBlue );      
       //disp.Ask();
@@ -523,13 +620,13 @@ GPUh() void AliHLTTPCCATracker::WriteOutput()
 
   //cout<<"output: nTracks = "<<*fNTracks<<", nHitsTotal="<<NHitsTotal()<<std::endl;
 
-  fOutput->SetNTracks( *fNTracks );
-  fOutput->SetNTrackClusters( *fNTrackHits );
-  fOutput->SetPointers();
+  fOutput->SetNTracks( fCommonMem->fNTracks );
+  fOutput->SetNTrackClusters( fCommonMem->fNTrackHits );
+  fOutput->Allocate();
 
   int nStoredHits = 0;
 
-  for ( int iTr = 0; iTr < *fNTracks; iTr++ ) {
+  for ( int iTr = 0; iTr < fCommonMem->fNTracks; iTr++ ) {
     AliHLTTPCCATrack &iTrack = fTracks[iTr];
 
     AliHLTTPCCASliceTrack out;
@@ -601,12 +698,12 @@ GPUh() void AliHLTTPCCATracker::WriteOutput()
 
 
   // old stuff
-
-  *fNOutTrackHits = 0;
-  *fNOutTracks = 0;
+#ifndef HLTCA_STANDALONE
+  fOutput->SetNOutTrackHits(0);
+  fOutput->SetNOutTracks(0);
 
 
-  for ( int iTr = 0; iTr < *fNTracks; iTr++ ) {
+  for ( int iTr = 0; iTr < fCommonMem->fNTracks; iTr++ ) {
 
     const AliHLTTPCCATrack &iTrack = fTracks[iTr];
 
@@ -614,24 +711,24 @@ GPUh() void AliHLTTPCCATracker::WriteOutput()
 
     //if( !iTrack.Alive() ) continue;
     if ( iTrack.NHits() < 3 ) continue;
-    AliHLTTPCCAOutTrack &out = fOutTracks[*fNOutTracks];
-    out.SetFirstHitRef( *fNOutTrackHits );
+    AliHLTTPCCAOutTrack &out = fOutput->OutTracks()[fOutput->NOutTracks()];
+    out.SetFirstHitRef( fOutput->NOutTrackHits() );
     out.SetNHits( 0 );
     out.SetOrigTrackID( iTr );
     out.SetStartPoint( iTrack.Param() );
     out.SetEndPoint( iTrack.Param() );
 
     int iID = iTrack.FirstHitID();
-    int nOutTrackHitsOld = *fNOutTrackHits;
+    int nOutTrackHitsOld = fOutput->NOutTrackHits();
 
     for ( int ith = 0; ith < iTrack.NHits(); ith++ ) {
       const AliHLTTPCCAHitId &ic = fTrackHits[iID + ith];
       const AliHLTTPCCARow &row = Row( ic );
       int ih = ic.HitIndex();
-      fOutTrackHits[*fNOutTrackHits] = HitInputID( row, ih );
-      ( *fNOutTrackHits )++;
+      fOutput->SetOutTrackHit(fOutput->NOutTrackHits(), HitInputID( row, ih ));
+      fOutput->SetNOutTrackHits(fOutput->NOutTrackHits() + 1 );
       //std::cout<<"write i,row,hit,id="<<ith<<", "<<ID2IRow(ic)<<", "<<ih<<", "<<HitInputID( row, ih )<<std::endl;
-      if ( *fNOutTrackHits >= 10*NHitsTotal() ) {
+      if ( fOutput->NOutTrackHits() >= 10*NHitsTotal() ) {
         std::cout << "fNOutTrackHits>NHitsTotal()" << std::endl;
         //exit(0);
         return;//SG!!!
@@ -639,12 +736,12 @@ GPUh() void AliHLTTPCCATracker::WriteOutput()
       out.SetNHits( out.NHits() + 1 );
     }
     if ( out.NHits() >= 2 ) {
-      ( *fNOutTracks )++;
+      fOutput->SetNOutTracks(fOutput->NOutTracks() + 1);
     } else {
-      ( *fNOutTrackHits ) = nOutTrackHitsOld;
+      fOutput->SetNOutTrackHits(nOutTrackHitsOld);
     }
   }
-
+#endif
 
   timer.Stop();
   fTimers[5] += timer.CpuTime();
@@ -830,16 +927,16 @@ GPUh() void AliHLTTPCCATracker::WriteTracks( std::ostream &out )
   //* Write tracks to file
 
   out << fTimers[0] << std::endl;
-  out << *fNOutTrackHits << std::endl;
-  for ( int ih = 0; ih < *fNOutTrackHits; ih++ ) {
-    out << fOutTrackHits[ih] << " ";
+  out << fOutput->NOutTrackHits() << std::endl;
+  for ( int ih = 0; ih < fOutput->NOutTrackHits(); ih++ ) {
+    out << fOutput->OutTrackHit(ih) << " ";
   }
   out << std::endl;
 
-  out << *fNOutTracks << std::endl;
+  out << fOutput->NOutTracks() << std::endl;
 
-  for ( int itr = 0; itr < *fNOutTracks; itr++ ) {
-    AliHLTTPCCAOutTrack &t = fOutTracks[itr];
+  for ( int itr = 0; itr < fOutput->NOutTracks(); itr++ ) {
+    const AliHLTTPCCAOutTrack &t = fOutput->OutTrack(itr);
     AliHLTTPCCATrackParam p1 = t.StartPoint();
     AliHLTTPCCATrackParam p2 = t.EndPoint();
     out << t.NHits() << " ";
@@ -868,16 +965,20 @@ GPUh() void AliHLTTPCCATracker::WriteTracks( std::ostream &out )
 GPUh() void AliHLTTPCCATracker::ReadTracks( std::istream &in )
 {
   //* Read tracks  from file
+       int tmpval;
   in >> fTimers[0];
-  in >> *fNOutTrackHits;
+  in >> tmpval;
+  fOutput->SetNOutTrackHits(tmpval);
 
-  for ( int ih = 0; ih < *fNOutTrackHits; ih++ ) {
-    in >> fOutTrackHits[ih];
+  for ( int ih = 0; ih < fOutput->NOutTrackHits(); ih++ ) {
+    in >> tmpval;
+       fOutput->SetOutTrackHit(ih, tmpval);
   }
-  in >> *fNOutTracks;
+  in >> tmpval;
+  fOutput->SetNOutTracks(tmpval);
 
-  for ( int itr = 0; itr < *fNOutTracks; itr++ ) {
-    AliHLTTPCCAOutTrack &t = fOutTracks[itr];
+  for ( int itr = 0; itr < fOutput->NOutTracks(); itr++ ) {
+    AliHLTTPCCAOutTrack &t = fOutput->OutTracks()[itr];
     AliHLTTPCCATrackParam p1, p2;
     int i;
     float f;
index b6ec633..ca2c287 100644 (file)
@@ -12,6 +12,7 @@
 
 
 #include "AliHLTTPCCADef.h"
+#include "AliHLTTPCCAGPUConfig.h"
 #include "AliHLTTPCCAParam.h"
 #include "AliHLTTPCCARow.h"
 #include "AliHLTTPCCAHit.h"
 #include "AliHLTTPCCASliceData.h"
 #include "AliHLTTPCCATracklet.h"
 #include "AliHLTTPCCAOutTrack.h"
+#include "AliHLTTPCCASliceOutput.h"
+#include "AliHLTTPCCATrackletConstructor.h"
 
 class AliHLTTPCCATrack;
 class AliHLTTPCCATrackParam;
 class AliHLTTPCCAClusterData;
-class AliHLTTPCCASliceOutput;
 
 /**
  * @class AliHLTTPCCATracker
@@ -39,41 +41,65 @@ class AliHLTTPCCASliceOutput;
  * The class is under construction.
  *
  */
+
+class AliHLTTPCCAClusterData;
+
 class AliHLTTPCCATracker
 {
-       friend class AliHLTTPCCAGPUTracker;
   public:
 
        AliHLTTPCCATracker()
-               :
-               fParam(),
+               : fParam(),
                fClusterData( 0 ),
                fData(),
                fIsGPUTracker( false ),
                fGPUDebugLevel( 0 ),
                fGPUDebugOut( 0 ),
-               fCommonMemory( 0 ),
-               fCommonMemorySize( 0 ),
+               fRowStartHitCountOffset( NULL ),
+               fTrackletTmpStartHits( NULL ),
+               fGPUTrackletTemp( NULL ),
+               fRowBlockTracklets( NULL ),
+               fRowBlockPos( NULL ),
+               fBlockStartingTracklet( NULL ),
+               fGPUParametersConst(),
+               fCommonMem( 0 ),
                fHitMemory( 0 ),
                fHitMemorySize( 0 ),
+               fTrackletMemory( 0 ),
+               fTrackletMemorySize( 0 ),
                fTrackMemory( 0 ),
                fTrackMemorySize( 0 ),
-               fNTracklets( 0 ),
                fTrackletStartHits( 0 ),
                fTracklets( 0 ),
-               fNTracks( 0 ),
+               fTrackletRowHits( NULL ),
                fTracks( 0 ),
-               fNTrackHits( 0 ),
                fTrackHits( 0 ),
-               fOutput( 0 ),
-               fNOutTracks( 0 ),
-               fOutTracks( 0 ),
-               fNOutTrackHits( 0 ),
-               fOutTrackHits( 0 )
+               fOutput( 0 )
        {
          // constructor
        }
-    GPUd() ~AliHLTTPCCATracker();
+    ~AliHLTTPCCATracker();
+
+       struct StructGPUParameters
+       {
+               int fScheduleFirstDynamicTracklet;              //Last Tracklet with fixed position in sheduling
+               int fGPUError;                                                  //Signalizes error on GPU during GPU Reconstruction, kind of return value
+       };
+
+       struct StructGPUParametersConst
+       {
+               int fGPUFixedBlockCount;                                //Count of blocks that is used for this tracker in fixed schedule situations
+               int fGPUiSlice;
+               int fGPUnSlices;
+       };
+
+       struct commonMemoryStruct
+       {
+           int fNTracklets;     // number of tracklets
+           int fNTracks;            // number of reconstructed tracks
+           int fNTrackHits;           // number of track hits
+               StructGPUParameters fGPUParameters;
+       };
 
     void Initialize( const AliHLTTPCCAParam &param );
 
@@ -98,12 +124,15 @@ class AliHLTTPCCATracker
        void SetGPUDebugLevel(int Level, std::ostream *NewDebugOut = NULL) {fGPUDebugLevel = Level;if (NewDebugOut) fGPUDebugOut = NewDebugOut;}
 
        char* SetGPUTrackerCommonMemory(char* pGPUMemory);
-       char* SetGPUTrackerHitsMemory(char* pGPUMemory, int MaxNHits );
+       char* SetGPUTrackerHitsMemory(char* pGPUMemory, int MaxNHits);
+       char* SetGPUTrackerTrackletsMemory(char* pGPUMemory, int MaxNTracklets);
        char* SetGPUTrackerTracksMemory(char* pGPUMemory, int MaxNTracks, int MaxNHits );
 
        //Debugging Stuff
+       void DumpSliceData(std::ostream &out);  //Dump Input Slice Data
        void DumpLinks(std::ostream &out);              //Dump all links to file (for comparison after NeighboursFinder/Cleaner)
        void DumpStartHits(std::ostream &out);  //Same for Start Hits
+       void DumpHitWeights(std::ostream &out); //....
        void DumpTrackHits(std::ostream &out);  //Same for Track Hits
        void DumpTrackletHits(std::ostream &out);       //Same for Track Hits
 
@@ -113,13 +142,16 @@ class AliHLTTPCCATracker
     void FitTrack( const AliHLTTPCCATrack &track, float *t0 = 0 ) const;
     void FitTrackFull( const AliHLTTPCCATrack &track, float *t0 = 0 ) const;
 
-       void SetPointersCommon();
+    void SetupCommonMemory();
     void SetPointersHits( int MaxNHits );
+       void SetPointersTracklets ( int MaxNTracklets );
     void SetPointersTracks( int MaxNTracks, int MaxNHits );
 
-#if !defined(HLTCA_GPUCODE)
+       void SetOutput( AliHLTTPCCASliceOutput* out ) { fOutput = out; }
+
     void ReadEvent( AliHLTTPCCAClusterData *clusterData );
 
+#if !defined(HLTCA_GPUCODE)
     GPUh() void WriteEvent( std::ostream &out );
     GPUh() void WriteTracks( std::ostream &out ) ;
     GPUh() void ReadTracks( std::istream &in );
@@ -128,8 +160,12 @@ class AliHLTTPCCATracker
     GPUhd() const AliHLTTPCCAParam &Param() const { return fParam; }
     GPUhd() void SetParam( const AliHLTTPCCAParam &v ) { fParam = v; }
 
-    GPUhd() const AliHLTTPCCAClusterData *ClusterData() const { return fClusterData; }
+    GPUhd() AliHLTTPCCAClusterData *ClusterData() const { return fClusterData; }
     GPUhd() const AliHLTTPCCASliceData &Data() const { return fData; }
+       GPUhd() AliHLTTPCCASliceData *pData() {return &fData; }
+
+       GPUh() void ClearSliceDataHitWeights() {fData.ClearHitWeights();}
+
     GPUhd() const AliHLTTPCCARow &Row( int rowIndex ) const { return fData.Row( rowIndex ); }
     GPUh() const AliHLTTPCCARow &Row( const AliHLTTPCCAHitId &HitId ) const { return fData.Row( HitId.RowIndex() ); }
 
@@ -143,7 +179,12 @@ class AliHLTTPCCATracker
     GPUd() short HitLinkUpData( const AliHLTTPCCARow &row, int hitIndex ) const { return fData.HitLinkUpData( row, hitIndex ); }
     GPUd() short HitLinkDownData( const AliHLTTPCCARow &row, int hitIndex ) const { return fData.HitLinkDownData( row, hitIndex ); }
 
-    GPUd() int FirstHitInBin( const AliHLTTPCCARow &row, int binIndex ) const { return fData.FirstHitInBin( row, binIndex ); }
+       GPUd() const ushort2 *HitData( const AliHLTTPCCARow &row ) const { return fData.HitData(row); }
+       GPUd() const short_v *HitLinkUpData  ( const AliHLTTPCCARow &row ) const { return fData.HitLinkUpData(row); }
+       GPUd() const short_v *HitLinkDownData( const AliHLTTPCCARow &row ) const { return fData.HitLinkDownData(row); }
+       GPUd() const ushort_v *FirstHitInBin( const AliHLTTPCCARow &row ) const { return fData.FirstHitInBin(row); }
+       
+       GPUd() int FirstHitInBin( const AliHLTTPCCARow &row, int binIndex ) const { return fData.FirstHitInBin( row, binIndex ); }
 
     GPUd() unsigned short HitDataY( const AliHLTTPCCARow &row, int hitIndex ) const {
       return fData.HitDataY( row, hitIndex );
@@ -151,6 +192,9 @@ class AliHLTTPCCATracker
     GPUd() unsigned short HitDataZ( const AliHLTTPCCARow &row, int hitIndex ) const {
       return fData.HitDataZ( row, hitIndex );
     }
+    GPUd() ushort2 HitData( const AliHLTTPCCARow &row, int hitIndex ) const {
+      return fData.HitData( row, hitIndex );
+    }
 
     GPUhd() int HitInputID( const AliHLTTPCCARow &row, int hitIndex ) const { return fData.ClusterDataIndex( row, hitIndex ); }
 
@@ -172,83 +216,107 @@ class AliHLTTPCCATracker
       return fData.HitWeight( row, hitIndex );
     }
 
-    GPUhd() int NTracklets() const { return *fNTracklets; }
-    GPUhd() int  *NTracklets() { return fNTracklets; }
+    GPUhd() int NTracklets() const { return fCommonMem->fNTracklets; }
+    GPUhd() int  *NTracklets() { return &fCommonMem->fNTracklets; }
 
     GPUhd() const AliHLTTPCCAHitId &TrackletStartHit( int i ) const { return fTrackletStartHits[i]; }
     GPUhd() AliHLTTPCCAHitId *TrackletStartHits() const { return fTrackletStartHits; }
+    GPUhd() AliHLTTPCCAHitId *TrackletTmpStartHits() const { return fTrackletTmpStartHits; }
     GPUhd() const AliHLTTPCCATracklet &Tracklet( int i ) const { return fTracklets[i]; }
     GPUhd() AliHLTTPCCATracklet  *Tracklets() const { return fTracklets;}
+       GPUhd() int* TrackletRowHits() { return fTrackletRowHits; }
 
-    GPUhd() int *NTracks()  const { return fNTracks; }
+    GPUhd() int *NTracks()  const { return &fCommonMem->fNTracks; }
     GPUhd() AliHLTTPCCATrack *Tracks() const { return  fTracks; }
-    GPUhd() int *NTrackHits()  const { return fNTrackHits; }
+    GPUhd() int *NTrackHits()  const { return &fCommonMem->fNTrackHits; }
     GPUhd() AliHLTTPCCAHitId *TrackHits() const { return fTrackHits; }
 
-    GPUhd() const AliHLTTPCCASliceOutput * Output() const { return fOutput; }
-
-    GPUhd()  int *NOutTracks() const { return  fNOutTracks; }
-    GPUhd()  AliHLTTPCCAOutTrack *OutTracks() const { return  fOutTracks; }
-    GPUhd()  const AliHLTTPCCAOutTrack &OutTrack( int index ) const { return fOutTracks[index]; }
-    GPUhd()  int *NOutTrackHits() const { return  fNOutTrackHits; }
-    GPUhd()  int *OutTrackHits() const { return  fOutTrackHits; }
-    GPUhd()  int OutTrackHit( int i ) const { return  fOutTrackHits[i]; }
-
-#ifndef CUDA_DEVICE_EMULATION
-  private:
+    GPUhd() AliHLTTPCCASliceOutput * Output() const { return fOutput; }
+
+       GPUh() commonMemoryStruct *CommonMemory() {return(fCommonMem); }
+       static GPUh() size_t CommonMemorySize() { return(sizeof(AliHLTTPCCATracker::commonMemoryStruct)); }
+       GPUh() char* &HitMemory() {return(fHitMemory); }
+       GPUh() size_t HitMemorySize() const {return(fHitMemorySize); }
+       GPUh() char* &TrackletMemory() {return(fTrackletMemory); }
+       GPUh() size_t TrackletMemorySize() const {return(fTrackletMemorySize); }
+       GPUh() char* &TrackMemory() {return(fTrackMemory); }
+       GPUh() size_t TrackMemorySize() const {return(fTrackMemorySize); }
+       GPUh() char* OutputMemory() const {return(fOutput->Memory()); }
+       GPUh() size_t OutputMemorySize() const {return(fOutput->MemorySize()); }
+       GPUhd() AliHLTTPCCARow* SliceDataRows() {return(fData.Rows()); }
+
+       GPUhd() uint3* RowStartHitCountOffset() const {return(fRowStartHitCountOffset);}
+       GPUhd() AliHLTTPCCATrackletConstructor::AliHLTTPCCAGPUTempMemory* GPUTrackletTemp() const {return(fGPUTrackletTemp);}
+       GPUhd() int* RowBlockTracklets(int reverse, int iRowBlock) const {return(&fRowBlockTracklets[(reverse * ((fParam.NRows() / HLTCA_GPU_SCHED_ROW_STEP) + 1) + iRowBlock) * fCommonMem->fNTracklets]);}
+       GPUhd() int* RowBlockTracklets() const {return(fRowBlockTracklets);}
+       GPUhd() int4* RowBlockPos(int reverse, int iRowBlock) const {return(&fRowBlockPos[reverse * ((fParam.NRows() / HLTCA_GPU_SCHED_ROW_STEP) + 1) + iRowBlock]);}
+       GPUhd() int4* RowBlockPos() const {return(fRowBlockPos);}
+       GPUhd() uint2* BlockStartingTracklet() const {return(fBlockStartingTracklet);}
+       GPUhd() StructGPUParameters* GPUParameters() const {return(&fCommonMem->fGPUParameters);}
+       GPUhd() StructGPUParametersConst* GPUParametersConst() {return(&fGPUParametersConst);}
+       
+       GPUh() unsigned long long int* PerfTimer(unsigned int i) {return &fPerfTimers[i]; }
+
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+       char* fStageAtSync;                             //Pointer to array storing current stage for every thread at every sync point
+       int* fThreadTimes;
 #endif
 
-    void SetupCommonMemory();
-
+  private:
     AliHLTTPCCAParam fParam; // parameters
-    double fTimers[10]; // running CPU time for different parts of the algorithm
+       double fTimers[10];
+    unsigned long long int fPerfTimers[16]; // running CPU time for different parts of the algorithm
+       void StandalonePerfTime(int i);
 
     /** A pointer to the ClusterData object that the SliceData was created from. This can be used to
      * merge clusters from inside the SliceTracker code and recreate the SliceData. */
     AliHLTTPCCAClusterData *fClusterData; // ^
     AliHLTTPCCASliceData fData; // The SliceData object. It is used to encapsulate the storage in memory from the access
 
-  //Will this tracker run on GPU?
-  bool fIsGPUTracker; // is it GPU tracker
-  int fGPUDebugLevel; // debug level
-  std::ostream *fGPUDebugOut; // debug stream
+       bool fIsGPUTracker; // is it GPU tracker object
+       int fGPUDebugLevel; // debug level
+       std::ostream *fGPUDebugOut; // debug stream
 
-    // event
+       //GPU Temp Arrays
+       uint3* fRowStartHitCountOffset;                         //Offset, length and new offset of start hits in row
+       AliHLTTPCCAHitId *fTrackletTmpStartHits;        //Unsorted start hits
+       AliHLTTPCCATrackletConstructor::AliHLTTPCCAGPUTempMemory *fGPUTrackletTemp;     //Temp Memory for GPU Tracklet Constructor
+       int* fRowBlockTracklets;                                        //Reference which tracklet is processed in which rowblock next
+       int4* fRowBlockPos;                                                     //x is last tracklet to be processed, y is last tracklet already processed, z is last tracklet to be processed in next iteration, w is initial x value to check if tracklet must be initialized
+       uint2* fBlockStartingTracklet;
 
-    char *fCommonMemory; // common event memory
-    size_t   fCommonMemorySize; // size of the event memory [bytes]
+       StructGPUParametersConst fGPUParametersConst;
+
+       // event
+
+       commonMemoryStruct *fCommonMem; // common event memory
 
     char *fHitMemory; // event memory for hits
     size_t   fHitMemorySize; // size of the event memory [bytes]
 
+       char *fTrackletMemory;
+       size_t fTrackletMemorySize;
+
     char *fTrackMemory; // event memory for tracks
     size_t   fTrackMemorySize; // size of the event memory [bytes]
 
-
-    int *fNTracklets;     // number of tracklets
     AliHLTTPCCAHitId *fTrackletStartHits;   // start hits for the tracklets
     AliHLTTPCCATracklet *fTracklets; // tracklets
+       int *fTrackletRowHits;
 
     //
-    int *fNTracks;            // number of reconstructed tracks
     AliHLTTPCCATrack *fTracks;  // reconstructed tracks
-    int *fNTrackHits;           // number of track hits
     AliHLTTPCCAHitId *fTrackHits;          // array of track hit numbers
 
     // output
 
     AliHLTTPCCASliceOutput *fOutput;
 
-    // obsolete output
-
-    int *fNOutTracks; // number of tracks in fOutTracks array
-    AliHLTTPCCAOutTrack *fOutTracks; // output array of the reconstructed tracks
-    int *fNOutTrackHits;  // number of hits in fOutTrackHits array
-    int *fOutTrackHits;  // output array of ID's of the reconstructed hits
-
     // disable copy
     AliHLTTPCCATracker( const AliHLTTPCCATracker& );
     AliHLTTPCCATracker &operator=( const AliHLTTPCCATracker& );
+
+       static int starthitSortComparison(const void*a, const void* b);
 };
 
 
index 451e308..fe9ec4b 100644 (file)
@@ -30,7 +30,7 @@ using namespace std;
 
 #include "AliHLTTPCCATrackerComponent.h"
 #include "AliHLTTPCTransform.h"
-#include "AliHLTTPCCATracker.h"
+#include "AliHLTTPCCATrackerFramework.h"
 #include "AliHLTTPCCAOutTrack.h"
 #include "AliHLTTPCCAParam.h"
 #include "AliHLTTPCCATrackConvertor.h"
@@ -61,6 +61,7 @@ ClassImp( AliHLTTPCCATrackerComponent )
 AliHLTTPCCATrackerComponent::AliHLTTPCCATrackerComponent()
     :
     fTracker( NULL ),
+       fOutput( NULL ),
     fSolenoidBz( 0 ),
     fMinNTrackClusters( 0 ),
     fClusterZCut( 500. ),
@@ -83,6 +84,7 @@ AliHLTTPCCATrackerComponent::AliHLTTPCCATrackerComponent( const AliHLTTPCCATrack
     :
     AliHLTProcessor(),
     fTracker( NULL ),
+       fOutput( NULL),
     fSolenoidBz( 0 ),
     fMinNTrackClusters( 30 ),
     fClusterZCut( 500. ),
@@ -109,6 +111,7 @@ AliHLTTPCCATrackerComponent::~AliHLTTPCCATrackerComponent()
 {
   // see header file for class documentation
   delete fTracker;
+  delete fOutput;
 }
 
 //
@@ -322,9 +325,11 @@ int AliHLTTPCCATrackerComponent::DoInit( int argc, const char** argv )
 {
   // Configure the CA tracker component
 
-  if ( fTracker ) return EINPROGRESS;
+  if ( fTracker || fOutput ) return EINPROGRESS;
 
-  fTracker = new AliHLTTPCCATracker();
+
+  fTracker = new AliHLTTPCCATrackerFramework();
+  fOutput = new AliHLTTPCCASliceOutput();
 
   TString arguments = "";
   for ( int i = 0; i < argc; i++ ) {
@@ -341,6 +346,8 @@ int AliHLTTPCCATrackerComponent::DoDeinit()
   // see header file for class documentation
   delete fTracker;
   fTracker = NULL;
+  delete fOutput;
+  fOutput = NULL;
   return 0;
 }
 
@@ -459,7 +466,8 @@ int AliHLTTPCCATrackerComponent::DoEvent
 
 
   {
-    if ( !fTracker ) fTracker = new AliHLTTPCCATracker;
+    if ( !fTracker ) fTracker = new AliHLTTPCCATrackerFramework;
+       if ( !fOutput ) fOutput = new AliHLTTPCCASliceOutput;
     int iSec = slice;
     float inRmin = 83.65;
     //    float inRmax = 133.3;
@@ -497,7 +505,7 @@ int AliHLTTPCCATrackerComponent::DoEvent
     if( fClusterErrorCorrectionY>1.e-4 ) param.SetClusterError2CorrectionY( fClusterErrorCorrectionY*fClusterErrorCorrectionY );
     if( fClusterErrorCorrectionZ>1.e-4 ) param.SetClusterError2CorrectionZ( fClusterErrorCorrectionZ*fClusterErrorCorrectionZ );
     param.Update();
-    fTracker->Initialize( param );
+    fTracker->InitializeSliceParam( slice, param );
     delete[] rowX;
   }
 
@@ -575,22 +583,20 @@ int AliHLTTPCCATrackerComponent::DoEvent
 
   TStopwatch timerReco;
 
-  fTracker->ReadEvent( &clusterData );
-
-  fTracker->Reconstruct();
+  fTracker->ProcessSlices(slice, 1, &clusterData, fOutput);
 
   timerReco.Stop();
 
   int ret = 0;
 
   Logging( kHLTLogDebug, "HLT::TPCCATracker::DoEvent", "Reconstruct",
-           "%d tracks found for slice %d", fTracker->NOutTracks(), slice );
+           "%d tracks found for slice %d", fOutput->NOutTracks(), slice );
 
 
   // write reconstructed tracks
 
   unsigned int mySize = 0;
-  int ntracks = *fTracker->NOutTracks();
+  int ntracks = fOutput->NOutTracks();
 
 
   if ( fOutputTRAKSEGS ) {
@@ -605,7 +611,7 @@ int AliHLTTPCCATrackerComponent::DoEvent
 
     for ( int itr = 0; itr < ntracks; itr++ ) {
 
-      AliHLTTPCCAOutTrack &t = fTracker->OutTracks()[itr];
+      AliHLTTPCCAOutTrack &t = fOutput->OutTracks()[itr];
 
       //Logging( kHLTLogDebug, "HLT::TPCCATracker::DoEvent", "Wrtite output","track %d with %d hits", itr, t.NHits());
 
@@ -625,10 +631,10 @@ int AliHLTTPCCATrackerComponent::DoEvent
 
       int iFirstRow = 1000;
       int iLastRow = -1;
-      int iFirstHit = fTracker->OutTrackHits()[t.FirstHitRef()];
+      int iFirstHit = fOutput->OutTrackHit(t.FirstHitRef());
       int iLastHit = iFirstHit;
       for ( int ih = 0; ih < t.NHits(); ih++ ) {
-        int hitID = fTracker->OutTrackHits()[t.FirstHitRef() + ih ];
+        int hitID = fOutput->OutTrackHit(t.FirstHitRef() + ih);
         int iRow = clusterData.RowNumber( hitID );
         if ( iRow < iFirstRow ) {  iFirstRow = iRow; iFirstHit = hitID; }
         if ( iRow > iLastRow ) { iLastRow = iRow; iLastHit = hitID; }
@@ -687,7 +693,7 @@ int AliHLTTPCCATrackerComponent::DoEvent
       currOutTracklet->fNPoints = t.NHits();
 
       for ( int i = 0; i < t.NHits(); i++ ) {
-        currOutTracklet->fPointIDs[i] = clusterData.Id( fTracker->OutTrackHits()[t.FirstHitRef()+i] );
+        currOutTracklet->fPointIDs[i] = clusterData.Id( fOutput->OutTrackHit(t.FirstHitRef()+i) );
       }
 
       currOutTracklet = ( AliHLTTPCTrackSegmentData* )( ( Byte_t * )currOutTracklet + dSize );
@@ -697,10 +703,10 @@ int AliHLTTPCCATrackerComponent::DoEvent
 
   } else { // default output type
 
-    mySize = fTracker->Output()->EstimateSize( fTracker->Output()->NTracks(),
-             fTracker->Output()->NTrackClusters() );
+    mySize = fOutput->EstimateSize( fOutput->NTracks(),
+             fOutput->NTrackClusters() );
     if ( mySize <= maxBufferSize ) {
-      const AliHLTUInt8_t* outputevent = reinterpret_cast<const AliHLTUInt8_t*>( fTracker->Output() );
+      const AliHLTUInt8_t* outputevent = reinterpret_cast<const AliHLTUInt8_t*>( fOutput );
       for ( unsigned int i = 0; i < mySize; i++ ) outputPtr[i] = outputevent[i];
     } else {
       HLTWarning( "Output buffer size exceed (buffer size %d, current size %d), tracks are not stored", maxBufferSize, mySize );
index bd780bf..3b88c91 100644 (file)
@@ -12,7 +12,8 @@
 
 #include "AliHLTProcessor.h"
 
-class AliHLTTPCCATracker;
+class AliHLTTPCCATrackerFramework;
+class AliHLTTPCCASliceOutput;
 class AliHLTTPCSpacePointData;
 
 /**
@@ -75,7 +76,8 @@ class AliHLTTPCCATrackerComponent : public AliHLTProcessor
   private:
 
     /** the tracker object */
-    AliHLTTPCCATracker* fTracker;                                //! transient
+    AliHLTTPCCATrackerFramework* fTracker;                                //! transient
+       AliHLTTPCCASliceOutput* fOutput;
 
     /** magnetic field */
     double fSolenoidBz;                                            // see above
diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.cxx b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.cxx
new file mode 100644 (file)
index 0000000..ed9040e
--- /dev/null
@@ -0,0 +1,119 @@
+// @(#) $Id: AliHLTTPCCATracker.cxx 34611 2009-09-04 00:22:05Z sgorbuno $
+// **************************************************************************
+// This file is property of and copyright by the ALICE HLT Project          *
+// ALICE Experiment at CERN, All rights reserved.                           *
+//                                                                          *
+// Primary Authors: Sergey Gorbunov <sergey.gorbunov@kip.uni-heidelberg.de> *
+//                  Ivan Kisel <kisel@kip.uni-heidelberg.de>                *
+//                  for The ALICE HLT Project.                              *
+//                                                                          *
+// Permission to use, copy, modify and distribute this software and its     *
+// documentation strictly for non-commercial purposes is hereby granted     *
+// without fee, provided that the above copyright notice appears in all     *
+// copies and that both the copyright notice and this permission notice     *
+// appear in the supporting documentation. The authors make no claims       *
+// about the suitability of this software for any purpose. It is            *
+// provided "as is" without express or implied warranty.                    *
+//                                                                          *
+//***************************************************************************
+
+#include "AliHLTTPCCADef.h"
+#include "AliHLTTPCCAGPUConfig.h"
+#include "AliHLTTPCCATrackerFramework.h"
+#include "AliHLTTPCCAGPUTracker.h"
+#include "AliHLTTPCCATracker.h"
+#include "AliHLTTPCCAMath.h"
+#include "AliHLTTPCCAClusterData.h"
+
+#ifdef HLTCA_STANDALONE
+#include <omp.h>
+#endif
+
+int AliHLTTPCCATrackerFramework::InitGPU(int sliceCount, int forceDeviceID)
+{
+       //Initialize GPU Tracker and determine if GPU available
+       int retVal;
+       if (fGPUTrackerAvailable && (retVal = ExitGPU())) return(retVal);
+       retVal = fGPUTracker.InitGPU(sliceCount, forceDeviceID);
+       fUseGPUTracker = fGPUTrackerAvailable = retVal == 0;
+       fGPUSliceCount = sliceCount;
+       return(retVal);
+}
+
+int AliHLTTPCCATrackerFramework::ExitGPU()
+{
+       //Uninitialize GPU Tracker
+       if (!fGPUTrackerAvailable) return(0);
+       fUseGPUTracker = false;
+       fGPUTrackerAvailable = false;
+       return(fGPUTracker.ExitGPU());
+}
+
+void AliHLTTPCCATrackerFramework::SetGPUDebugLevel(int Level, std::ostream *OutFile, std::ostream *GPUOutFile)
+{
+       //Set Debug Level for GPU Tracker and also for CPU Tracker for comparison reasons
+       fGPUTracker.SetDebugLevel(Level, GPUOutFile);
+       fGPUDebugLevel = Level;
+       for (int i = 0;i < fgkNSlices;i++)
+       {
+               fCPUTrackers[i].SetGPUDebugLevel(Level, OutFile);
+       }
+}
+
+int AliHLTTPCCATrackerFramework::SetGPUTracker(bool enable)
+{
+       //Enable / disable GPU Tracker
+       if (enable && !fGPUTrackerAvailable)
+       {
+               fUseGPUTracker = false;
+               return(1);
+       }
+       fUseGPUTracker = enable;
+       return(0);
+}
+
+int AliHLTTPCCATrackerFramework::ProcessSlices(int firstSlice, int sliceCount, AliHLTTPCCAClusterData* pClusterData, AliHLTTPCCASliceOutput* pOutput)
+{
+       //Process sliceCount slices starting from firstslice, in is pClusterData array, out pOutput array
+       if (fUseGPUTracker)
+       {
+               if (fGPUTracker.Reconstruct(pOutput, pClusterData, firstSlice, CAMath::Min(sliceCount, fgkNSlices - firstSlice))) return(1);
+       }
+       else
+       {
+#ifdef HLTCA_STANDALONE
+#pragma omp parallel for
+#endif
+               for (int iSlice = 0;iSlice < CAMath::Min(sliceCount, fgkNSlices - firstSlice);iSlice++)
+               {
+                       fCPUTrackers[firstSlice + iSlice].ReadEvent(&pClusterData[iSlice]);
+                       fCPUTrackers[firstSlice + iSlice].SetOutput(&pOutput[iSlice]);
+                       fCPUTrackers[firstSlice + iSlice].Reconstruct();
+                       fCPUTrackers[firstSlice + iSlice].SetupCommonMemory();
+               }
+       }
+       
+       if (fGPUDebugLevel >= 6 && fUseGPUTracker)
+       {
+           fUseGPUTracker = 0;
+           ProcessSlices(firstSlice, sliceCount, pClusterData, pOutput);
+           fUseGPUTracker = 1;
+       }
+
+       //printf("Slice Tracks Output: %d\n", pOutput[0].NTracks());
+       return(0);
+}
+
+unsigned long long int* AliHLTTPCCATrackerFramework::PerfTimer(int GPU, int iSlice, int iTimer)
+{
+       //Performance information for slice trackers
+       return(GPU ? fGPUTracker.PerfTimer(iSlice, iTimer) : fCPUTrackers[iSlice].PerfTimer(iTimer));
+}
+
+int AliHLTTPCCATrackerFramework::InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param)
+{
+       //Initialize Tracker Parameters for a slice
+       if (fGPUTrackerAvailable && fGPUTracker.InitializeSliceParam(iSlice, param)) return(1);
+       fCPUTrackers[iSlice].Initialize(param);
+       return(0);
+}
\ No newline at end of file
diff --git a/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.h b/HLT/TPCLib/tracking-ca/AliHLTTPCCATrackerFramework.h
new file mode 100644 (file)
index 0000000..9cace75
--- /dev/null
@@ -0,0 +1,66 @@
+//-*- Mode: C++ -*-
+// @(#) $Id: AliHLTTPCCATracker.h 33907 2009-07-23 13:52:49Z sgorbuno $
+// ************************************************************************
+// This file is property of and copyright by the ALICE HLT Project        *
+// ALICE Experiment at CERN, All rights reserved.                         *
+// See cxx source for full Copyright notice                               *
+//                                                                        *
+//*************************************************************************
+
+#ifndef ALIHLTTPCCATRACKERFRAMEWORK_H
+#define ALIHLTTPCCATRACKERFRAMEWORK_H
+
+#include "AliHLTTPCCATracker.h"
+#include "AliHLTTPCCAGPUTracker.h"
+#include "AliHLTTPCCAParam.h"
+#include <iostream>
+
+class AliHLTTPCCASliceOutput;
+class AliHLTTPCCAClusterData;
+
+class AliHLTTPCCATrackerFramework
+{
+public:
+       AliHLTTPCCATrackerFramework() :
+         fGPUTrackerAvailable(false), fUseGPUTracker(false), fGPUDebugLevel(0), fGPUSliceCount(0), fGPUTracker(), fCPUSliceCount(fgkNSlices)
+         {
+           fGPUTrackerAvailable = fUseGPUTracker = fGPUSliceCount = (fGPUTracker.InitGPU(1, -1) == 0);
+         }
+    ~AliHLTTPCCATrackerFramework()
+         {}
+
+       int InitGPU(int sliceCount = 1, int forceDeviceID = -1);
+       int ExitGPU();
+       void SetGPUDebugLevel(int Level, std::ostream *OutFile = NULL, std::ostream *GPUOutFile = NULL);
+       int SetGPUTrackerOption(char* OptionName, int OptionValue) {return(fGPUTracker.SetGPUTrackerOption(OptionName, OptionValue));}
+       int SetGPUTracker(bool enable);
+
+       int InitializeSliceParam(int iSlice, AliHLTTPCCAParam &param);
+
+       int ProcessSlices(int firstSlice, int sliceCount, AliHLTTPCCAClusterData* pClusterData, AliHLTTPCCASliceOutput* pOutput);
+       unsigned long long int* PerfTimer(int GPU, int iSlice, int iTimer);
+
+       int MaxSliceCount() const { return(fUseGPUTracker ? fGPUSliceCount : fCPUSliceCount); }
+       int GetGPUStatus() const { return(fGPUTrackerAvailable + fUseGPUTracker); }
+
+       const AliHLTTPCCAParam& Param(int iSlice) const { return(fCPUTrackers[iSlice].Param()); }
+       const AliHLTTPCCARow& Row(int iSlice, int iRow) const { return(fCPUTrackers[iSlice].Row(iRow)); }  //TODO: Should be changed to return only row parameters
+
+private:
+  static const int fgkNSlices = 36;       //* N slices
+
+  bool fGPUTrackerAvailable; // Is the GPU Tracker Available?
+  bool fUseGPUTracker; // use the GPU tracker 
+  int fGPUDebugLevel;  // debug level for the GPU code
+  int fGPUSliceCount;  //How many slices to process parallel
+  AliHLTTPCCAGPUTracker fGPUTracker;
+
+  AliHLTTPCCATracker fCPUTrackers[fgkNSlices];
+  int fCPUSliceCount;
+
+  AliHLTTPCCATrackerFramework( const AliHLTTPCCATrackerFramework& );
+  AliHLTTPCCATrackerFramework &operator=( const AliHLTTPCCATrackerFramework& );
+
+};
+
+#endif
index 5871c81..0fa7aa7 100644 (file)
@@ -11,6 +11,7 @@
 
 #include "AliHLTTPCCADef.h"
 #include "AliHLTTPCCATrackParam.h"
+#include "AliHLTTPCCAGPUConfig.h"
 
 /**
  * @class ALIHLTTPCCATracklet
@@ -23,35 +24,36 @@ class AliHLTTPCCATracklet
   public:
 
 #if !defined(HLTCA_GPUCODE)
-    AliHLTTPCCATracklet() : fStartHitID( 0 ), fNHits( 0 ), fFirstRow( 0 ), fLastRow( 0 ), fParam() {};
+    AliHLTTPCCATracklet() : fNHits( 0 ), fFirstRow( 0 ), fLastRow( 0 ), fParam() {};
     void Dummy() const ;
     ~AliHLTTPCCATracklet() {}
 #endif
 
-    GPUhd() int StartHitID()            const { return fStartHitID; }
     GPUhd() int  NHits()                const { return fNHits;      }
     GPUhd() int  FirstRow()             const { return fFirstRow;   }
     GPUhd() int  LastRow()              const { return fLastRow;    }
-    GPUhd() const AliHLTTPCCATrackParam &Param() const { return fParam;      }
+    GPUhd() const AliHLTTPCCATrackParam &Param() const { return fParam; }
+#ifndef EXTERN_ROW_HITS
     GPUhd() int  RowHit( int i )   const { return fRowHits[i];    }
+       GPUhd() const int* RowHits()    const                   { return(fRowHits); }
+#endif
 
-    GPUhd() void SetStartHitID( int v )           { fStartHitID = v; }
     GPUhd() void SetNHits( int v )               {  fNHits = v;      }
     GPUhd() void SetFirstRow( int v )            {  fFirstRow = v;   }
     GPUhd() void SetLastRow( int v )             {  fLastRow = v;    }
     GPUhd() void SetParam( const AliHLTTPCCATrackParam &v ) { fParam = v;      }
+#ifndef EXTERN_ROW_HITS
     GPUhd() void SetRowHit( int irow, int ih )  { fRowHits[irow] = ih;    }
-
-#ifndef CUDA_DEVICE_EMULATION
-  private:
 #endif
 
-    int fStartHitID;            // ID of the starting hit
+  private:
     int fNHits;                 // N hits
     int fFirstRow;              // first TPC row
     int fLastRow;               // last TPC row
     AliHLTTPCCATrackParam fParam; // tracklet parameters
-    int fRowHits[160];          // hit index for each TPC row
+#ifndef EXTERN_ROW_HITS
+    int fRowHits[HLTCA_ROW_COUNT + 1];          // hit index for each TPC row
+#endif
 };
 
 #endif
index 4d6ca14..420d387 100644 (file)
 #include "AliHLTTPCCATrackParam.h"
 #include "AliHLTTPCCATrackParam.h"
 #include "AliHLTTPCCAGrid.h"
-#include "AliHLTTPCCAHitArea.h"
 #include "AliHLTTPCCAMath.h"
 #include "AliHLTTPCCADef.h"
 #include "AliHLTTPCCATracklet.h"
 #include "AliHLTTPCCATrackletConstructor.h"
+#include "MemoryAssignmentHelpers.h"
+
 //#include "AliHLTTPCCAPerformance.h"
 //#include "TH1D.h"
 
 #include "AliHLTTPCCADisplay.h"
 #endif
 
+#define kMaxRowGap 4
 
-GPUd() void AliHLTTPCCATrackletConstructor::Step0
-( int nBlocks, int /*nThreads*/, int iBlock, int iThread,
-  AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &/*tParam*/ )
-{
-  // reconstruction of tracklets, step 0
-
-  r.fIsMemThread = ( iThread < TRACKLET_CONSTRUCTOR_NMEMTHREDS );
-  if ( iThread == 0 ) {
-    int nTracks = *tracker.NTracklets();
-    int nTrPerBlock = nTracks / nBlocks + 1;
-    s.fNRows = tracker.Param().NRows();
-    s.fItr0 = nTrPerBlock * iBlock;
-    s.fItr1 = s.fItr0 + nTrPerBlock;
-    if ( s.fItr1 > nTracks ) s.fItr1 = nTracks;
-    s.fMinStartRow = 158;
-    s.fMaxEndRow = 0;
-  }
-  if ( iThread < 32 ) {
-    s.fMinStartRow32[iThread] = 158;
-  }
-}
-
-
-GPUd() void AliHLTTPCCATrackletConstructor::Step1
-( int /*nBlocks*/, int /*nThreads*/, int /*iBlock*/, int iThread,
-  AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam )
+GPUd() void AliHLTTPCCATrackletConstructor::InitTracklet( AliHLTTPCCATrackParam &tParam )
 {
-  // reconstruction of tracklets, step 1
-
-  r.fItr = s.fItr0 + ( iThread - TRACKLET_CONSTRUCTOR_NMEMTHREDS );
-  r.fGo = ( !r.fIsMemThread ) && ( r.fItr < s.fItr1 );
-  r.fSave = r.fGo;
-  r.fNHits = 0;
-
-  if ( !r.fGo ) return;
-
-  r.fStage = 0;
-
-  AliHLTTPCCATracklet &tracklet = tracker.Tracklets()[r.fItr];
-
-  unsigned int kThread = iThread % 32;//& 00000020;
-  if ( SAVE() ) for ( int i = 0; i < 160; i++ ) tracklet.SetRowHit( i, -1 );
-
-  AliHLTTPCCAHitId id = tracker.TrackletStartHits()[r.fItr];
-  r.fStartRow = id.RowIndex();
-  r.fEndRow = r.fStartRow;
-  r.fFirstRow = r.fStartRow;
-  r.fLastRow = r.fFirstRow;
-  r.fCurrIH =  id.HitIndex();
-
-  CAMath::AtomicMin( &s.fMinStartRow32[kThread], r.fStartRow );
+       //Initialize Tracklet Parameters using default values
   tParam.SetSinPhi( 0 );
   tParam.SetDzDs( 0 );
   tParam.SetQPt( 0 );
@@ -108,66 +62,110 @@ GPUd() void AliHLTTPCCATrackletConstructor::Step1
   tParam.SetCov( 12, 0 );
   tParam.SetCov( 13, 0 );
   tParam.SetCov( 14, 10. );
-
-}
-
-GPUd() void AliHLTTPCCATrackletConstructor::Step2
-( int /*nBlocks*/, int nThreads, int /*iBlock*/, int iThread,
-  AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &/*r*/, AliHLTTPCCATracker &/*tracker*/, AliHLTTPCCATrackParam &/*tParam*/ )
-{
-  // reconstruction of tracklets, step 2
-
-  if ( iThread == 0 ) {
-    //CAMath::AtomicMinGPU(&s.fMinRow, s.fMinRow32[iThread]);
-    int minStartRow = 158;
-    int n = ( nThreads > 32 ) ? 32 : nThreads;
-    for ( int i = 0; i < n; i++ ) {
-      if ( s.fMinStartRow32[i] < minStartRow ) minStartRow = s.fMinStartRow32[i];
-    }
-    s.fMinStartRow = minStartRow;
-  }
 }
 
 GPUd() void AliHLTTPCCATrackletConstructor::ReadData
-( int iThread, AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, int iRow )
+#ifndef HLTCA_GPU_PREFETCHDATA
+( int /*iThread*/, AliHLTTPCCASharedMemory& /*s*/, AliHLTTPCCAThreadMemory& /*r*/, AliHLTTPCCATracker& /*tracker*/, int /*iRow*/ )
+{
+       //Prefetch Data to shared memory
+#else
+( int iThread, AliHLTTPCCASharedMemory& s, AliHLTTPCCAThreadMemory& r, AliHLTTPCCATracker& tracker, int iRow )
 {
   // reconstruction of tracklets, read data step
-
-  if ( r.fIsMemThread ) {
     const AliHLTTPCCARow &row = tracker.Row( iRow );
-    bool jr = !r.fCurrentData;
+    //bool jr = !r.fCurrentData;
 
     // copy hits, grid content and links
 
     // FIXME: inefficient copy
-    const int numberOfHits = row.NHits();
+    //const int numberOfHitsAligned = NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(row.NHits());
+
+/*     
+#ifdef HLTCA_GPU_REORDERHITDATA
     ushort2 *sMem1 = reinterpret_cast<ushort2 *>( s.fData[jr] );
-    for ( int i = iThread; i < numberOfHits; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) {
+    for ( int i = iThread; i < numberOfHitsAligned; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) {
       sMem1[i].x = tracker.HitDataY( row, i );
       sMem1[i].y = tracker.HitDataZ( row, i );
     }
-    short *sMem2 = reinterpret_cast<short *>( s.fData[jr] ) + 2 * numberOfHits;
-    for ( int i = iThread; i < numberOfHits; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) {
-      sMem2[i] = tracker.HitLinkUpData( row, i );
+#else
+    ushort_v *sMem1 = reinterpret_cast<ushort_v *>( s.fData[jr] );
+    for ( int i = iThread; i < numberOfHitsAligned; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) {
+      sMem1[i] = tracker.HitDataY( row, i );
+    }
+
+    ushort_v *sMem1a = reinterpret_cast<ushort_v *>( s.fData[jr] ) + numberOfHitsAligned;
+    for ( int i = iThread; i < numberOfHitsAligned; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) {
+      sMem1a[i] = tracker.HitDataZ( row, i );
     }
+#endif
 
-    unsigned short *sMem3 = reinterpret_cast<unsigned short *>( s.fData[jr] ) + 3 * numberOfHits;
+    short *sMem2 = reinterpret_cast<short *>( s.fData[jr] ) + 2 * numberOfHitsAligned;
+    for ( int i = iThread; i < numberOfHitsAligned; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) {
+      sMem2[i] = tracker.HitLinkUpData( row, i );
+    }
+       
+    unsigned short *sMem3 = reinterpret_cast<unsigned short *>( s.fData[jr] ) + 3 * numberOfHitsAligned;
     const int n = row.FullSize(); // + grid content size
     for ( int i = iThread; i < n; i += TRACKLET_CONSTRUCTOR_NMEMTHREDS ) {
       sMem3[i] = tracker.FirstHitInBin( row, i );
-    }
-  }
+    }*/
+
+       /*for (int k = 0;k < 2;k++)
+       {
+               HLTCA_GPU_ROWCOPY* sharedMem;
+               const HLTCA_GPU_ROWCOPY* sourceMem;
+               int copyCount;
+               switch (k)
+               {
+               case 0:
+                       sourceMem = reinterpret_cast<const HLTCA_GPU_ROWCOPY *>( tracker.HitDataY(row) );
+                       sharedMem = reinterpret_cast<HLTCA_GPU_ROWCOPY *> (reinterpret_cast<ushort_v *>( s.fData[jr] ) + k * numberOfHitsAligned);
+                       copyCount = numberOfHitsAligned * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY);
+                       break;
+               case 1:
+                       sourceMem = reinterpret_cast<const HLTCA_GPU_ROWCOPY *>( tracker.HitDataZ(row) );
+                       sharedMem = reinterpret_cast<HLTCA_GPU_ROWCOPY *> (reinterpret_cast<ushort_v *>( s.fData[jr] ) + k * numberOfHitsAligned);
+                       copyCount = numberOfHitsAligned * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY);
+                       break;
+               case 2:
+                       sourceMem = reinterpret_cast<const HLTCA_GPU_ROWCOPY *>( tracker.HitLinkUpData(row) );
+                       sharedMem = reinterpret_cast<HLTCA_GPU_ROWCOPY *> (reinterpret_cast<ushort_v *>( s.fData[jr] ) + k * numberOfHitsAligned);
+                       copyCount = numberOfHitsAligned * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY);
+                       break;
+               case 1:
+                       sourceMem = reinterpret_cast<const HLTCA_GPU_ROWCOPY *>( tracker.FirstHitInBin(row) );
+                       sharedMem = reinterpret_cast<HLTCA_GPU_ROWCOPY *> (reinterpret_cast<ushort_v *>( s.fData[jr] ) + k * numberOfHitsAligned);
+                       copyCount = NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(row.FullSize()) * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY);
+                       break;
+               }
+               for (int i = iThread;i < copyCount;i += TRACKLET_CONSTRUCTOR_NMEMTHREDS)
+               {
+                       sharedMem[i] = sourceMem[i];
+               }
+       }*/
+
+       for (unsigned int i = iThread;i < row.FullSize() * sizeof(ushort_v) / sizeof(HLTCA_GPU_ROWCOPY);i += TRACKLET_CONSTRUCTOR_NMEMTHREDS)
+       {
+               reinterpret_cast<HLTCA_GPU_ROWCOPY *> (reinterpret_cast<ushort_v *>( s.fData[!r.fCurrentData] ))[i] = reinterpret_cast<const HLTCA_GPU_ROWCOPY *>( tracker.FirstHitInBin(row) )[i];
+       }
+
+       const HLTCA_GPU_ROWCOPY* const sourceMem = (const HLTCA_GPU_ROWCOPY *) &row;
+       HLTCA_GPU_ROWCOPY* const sharedMem = reinterpret_cast<HLTCA_GPU_ROWCOPY *> ( &s.fRow[!r.fCurrentData] );
+       for (unsigned int i = iThread;i < sizeof(AliHLTTPCCARow) / sizeof(HLTCA_GPU_ROWCOPY);i += TRACKLET_CONSTRUCTOR_NMEMTHREDS)
+       {
+               sharedMem[i] = sourceMem[i];
+       }
+#endif
 }
 
 
 GPUd() void AliHLTTPCCATrackletConstructor::StoreTracklet
 ( int /*nBlocks*/, int /*nThreads*/, int /*iBlock*/, int /*iThread*/,
-  AliHLTTPCCASharedMemory &/*s*/, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam )
+  AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam )
 {
   // reconstruction of tracklets, tracklet store step
 
-  if ( !r.fSave ) return;
-
   //AliHLTTPCCAPerformance::Instance().HNHitsPerTrackCand()->Fill(r.fNHits);
 
   do {
@@ -175,7 +173,7 @@ GPUd() void AliHLTTPCCATrackletConstructor::StoreTracklet
        //std::cout<<"tracklet to store: "<<r.fItr<<", nhits = "<<r.fNHits<<std::endl;
     }
 
-    if ( r.fNHits < 5 ) {
+    if ( r.fNHits < TRACKLET_SELECTOR_MIN_HITS ) {
       r.fNHits = 0;
       break;
     }
@@ -219,17 +217,27 @@ GPUd() void AliHLTTPCCATrackletConstructor::StoreTracklet
     }
 #endif
     if ( CAMath::Abs( tParam.Par()[4] ) < 1.e-4 ) tParam.SetPar( 4, 1.e-4 );
-    tracklet.SetFirstRow( CAMath::Min(r.fFirstRow, r.fStartRow ) );
+       if (r.fStartRow < r.fFirstRow) r.fFirstRow = r.fStartRow;
+       tracklet.SetFirstRow( r.fFirstRow );
     tracklet.SetLastRow( r.fLastRow );
     tracklet.SetParam( tParam );
     int w = ( r.fNHits << 16 ) + r.fItr;
-    for ( int iRow = 0; iRow < 160; iRow++ ) {
-      int ih = tracklet.RowHit( iRow );
+    for ( int iRow = r.fFirstRow; iRow <= r.fLastRow; iRow++ ) {
+#ifdef EXTERN_ROW_HITS
+      int ih = tracker.TrackletRowHits()[iRow * s.fNTracklets + r.fItr];
+#else
+         int ih = tracklet.RowHit( iRow );
+#endif
       if ( ih >= 0 ) {
-        tracker.MaximizeHitWeight( tracker.Row( iRow ), ih, w );
+#if defined(HLTCA_GPUCODE) & !defined(HLTCA_GPU_PREFETCHDATA) & !defined(HLTCA_GPU_PREFETCH_ROWBLOCK_ONLY)
+           tracker.MaximizeHitWeight( s.fRows[ iRow ], ih, w );
+#else
+           tracker.MaximizeHitWeight( tracker.Row( iRow ), ih, w );
+#endif
       }
     }
   }
+
 }
 
 GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet
@@ -245,11 +253,17 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet
 
   if ( !r.fGo ) return;
 
-  const int kMaxRowGap = 4;
-
+#ifndef EXTERN_ROW_HITS
   AliHLTTPCCATracklet &tracklet = tracker.Tracklets()[r.fItr];
+#endif
 
+#ifdef HLTCA_GPU_PREFETCHDATA
+  const AliHLTTPCCARow &row = s.fRow[r.fCurrentData];
+#elif defined(HLTCA_GPUCODE)
+  const AliHLTTPCCARow &row = s.fRows[iRow];
+#else
   const AliHLTTPCCARow &row = tracker.Row( iRow );
+#endif
 
   float y0 = row.Grid().YMin();
   float stepY = row.HstepY();
@@ -263,13 +277,45 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet
 
       if ( iRow < r.fStartRow || r.fCurrIH < 0  ) break;
 
-      if ( ( iRow - r.fStartRow ) % 2 != 0 ) break; // SG!!! - jump over the row
-
-      uint4 *tmpint4 = s.fData[r.fCurrentData];
-      ushort2 hh = reinterpret_cast<ushort2*>( tmpint4 )[r.fCurrIH];
+      if ( ( iRow - r.fStartRow ) % 2 != 0 )
+         {
+#ifndef EXTERN_ROW_HITS
+                 tracklet.SetRowHit(iRow, -1);
+#else
+                 tracker.TrackletRowHits()[iRow * s.fNTracklets + r.fItr] = -1;
+#endif
+                 break; // SG!!! - jump over the row
+         }
+
+//#ifdef HLTCA_GPU_PREFETCHDATA
+//      uint4 *tmpint4 = s.fData[r.fCurrentData];
+//#endif
+         ushort2 hh;
+//#ifdef HLTCA_GPU_REORDERHITDATA
+//      hh = reinterpret_cast<ushort2*>( tmpint4 )[r.fCurrIH];
+//#else
+//#ifdef HLTCA_GPU_PREFETCHDATA
+//       hh.x = reinterpret_cast<ushort_v*>( tmpint4 )[r.fCurrIH];
+//       hh.y = reinterpret_cast<ushort_v*>( tmpint4 )[NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(row.NHits()) + r.fCurrIH];
+//#else
+#if defined(HLTCA_GPU_TEXTURE_FETCH)
+         hh = tex1Dfetch(gAliTexRefu2, ((char*) tracker.Data().HitData() - tracker.pData()->GPUTextureBase()) / sizeof(ushort2) + row.HitNumberOffset() + r.fCurrIH);
+#else
+         hh = tracker.HitData(row)[r.fCurrIH];
+#endif
+//#endif
+//#endif
 
       int oldIH = r.fCurrIH;
-      r.fCurrIH = reinterpret_cast<short*>( tmpint4 )[2 * row.NHits() + r.fCurrIH]; // read from linkup data
+//#ifdef HLTCA_GPU_PREFETCHDATA
+//      r.fCurrIH = reinterpret_cast<short*>( tmpint4 )[2 * NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(row.NHits()) + r.fCurrIH]; // read from linkup data
+//#else
+#if defined(HLTCA_GPU_TEXTURE_FETCH)
+         r.fCurrIH = tex1Dfetch(gAliTexRefs, ((char*) tracker.Data().HitLinkUpData(row) - tracker.pData()->GPUTextureBase()) / sizeof(unsigned short) + r.fCurrIH);
+#else
+         r.fCurrIH = tracker.HitLinkUpData(row)[r.fCurrIH]; // read from linkup data
+#endif
+//#endif
 
       float x = row.X();
       float y = y0 + hh.x * stepY;
@@ -329,7 +375,11 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet
           #ifdef DRAW
           if ( drawFit ) std::cout << " tracklet " << r.fItr << ", row " << iRow << ": can not transport!!" << std::endl;
                  #endif
-          if ( SAVE() ) tracklet.SetRowHit( iRow, -1 );
+#ifndef EXTERN_ROW_HITS
+          tracklet.SetRowHit( iRow, -1 );
+#else
+                 tracker.TrackletRowHits()[iRow * s.fNTracklets + r.fItr] = -1;
+#endif
           break;
         }
         //std::cout<<"mark1 "<<r.fItr<<std::endl;
@@ -350,11 +400,19 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet
           #ifdef DRAW
           if ( drawFit ) std::cout << " tracklet " << r.fItr << ", row " << iRow << ": can not filter!!" << std::endl;
           #endif
-          if ( SAVE() ) tracklet.SetRowHit( iRow, -1 );
+#ifndef EXTERN_ROW_HITS
+          tracklet.SetRowHit( iRow, -1 );
+#else
+                 tracker.TrackletRowHits()[iRow * s.fNTracklets + r.fItr] = -1;
+#endif
           break;
         }
       }
-      if ( SAVE() ) tracklet.SetRowHit( iRow, oldIH );
+#ifndef EXTERN_ROW_HITS
+      tracklet.SetRowHit( iRow, oldIH );
+#else
+         tracker.TrackletRowHits()[iRow * s.fNTracklets + r.fItr] = oldIH;
+#endif
       if ( drawFit ) {
         #ifdef DRAW
         std::cout << "fit tracklet after filter " << r.fItr << ", row " << iRow << std::endl;
@@ -392,9 +450,6 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet
         AliHLTTPCCADisplay::Instance().Ask();
                #endif
       }
-      if ( r.fGo ) {
-        CAMath::AtomicMax( &s.fMaxEndRow, r.fEndRow - 1 );
-      }
     }
   } else { // forward/backward searching part
     do {
@@ -425,10 +480,20 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet
         #ifdef DRAW
         if ( drawSearch ) std::cout << " tracklet " << r.fItr << ", row " << iRow << ": can not transport!!" << std::endl;
         #endif
+#ifndef EXTERN_ROW_HITS
+               tracklet.SetRowHit(iRow, -1);
+#else
+               tracker.TrackletRowHits()[iRow * s.fNTracklets + r.fItr] = -1;
+#endif
         break;
       }
       if ( row.NHits() < 1 ) {
         // skip empty row
+#ifndef EXTERN_ROW_HITS
+                 tracklet.SetRowHit(iRow, -1);
+#else
+                 tracker.TrackletRowHits()[iRow * s.fNTracklets + r.fItr] = -1;
+#endif
         break;
       }
       if ( drawSearch ) {
@@ -439,9 +504,22 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet
         AliHLTTPCCADisplay::Instance().Ask();
                #endif
       }
+#ifdef HLTCA_GPU_PREFETCHDATA
       uint4 *tmpint4 = s.fData[r.fCurrentData];
+#endif
 
-      ushort2 *hits = reinterpret_cast<ushort2*>( tmpint4 );
+//#ifdef HLTCA_GPU_REORDERHITDATA
+//      const ushort2 *hits = reinterpret_cast<ushort2*>( tmpint4 );
+//#else
+//#ifdef HLTCA_GPU_PREFETCHDATA
+//       const ushort_v *hitsx = reinterpret_cast<ushort_v*>( tmpint4 );
+//       const ushort_v *hitsy = reinterpret_cast<ushort_v*>( tmpint4 ) + NextMultipleOf<sizeof(HLTCA_GPU_ROWALIGNMENT) / sizeof(ushort_v)>(row.NHits());
+//#else
+#ifndef HLTCA_GPU_TEXTURE_FETCH
+         const ushort2 *hits = tracker.HitData(row);
+#endif
+//#endif
+//#endif
 
       float fY = tParam.GetY();
       float fZ = tParam.GetZ();
@@ -468,15 +546,29 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet
         {
           int nY = row.Grid().Ny();
 
-          unsigned short *sGridP = ( reinterpret_cast<unsigned short*>( tmpint4 ) ) + 3 * row.NHits();
+//#ifdef HLTCA_GPU_PREFETCHDATA
+//               const unsigned short *sGridP = ( reinterpret_cast<unsigned short*>( tmpint4 ) );
+//#else
+#ifndef HLTCA_GPU_TEXTURE_FETCH
+                 const unsigned short *sGridP = tracker.FirstHitInBin(row);
+#endif
+//#endif
+
+#ifdef HLTCA_GPU_TEXTURE_FETCH
+                 fHitYfst = tex1Dfetch(gAliTexRefu, ((char*) tracker.Data().FirstHitInBin(row) - tracker.pData()->GPUTextureBase()) / sizeof(unsigned short) + fIndYmin);
+                 fHitYlst = tex1Dfetch(gAliTexRefu, ((char*) tracker.Data().FirstHitInBin(row) - tracker.pData()->GPUTextureBase()) / sizeof(unsigned short) + fIndYmin+2);
+                 fHitYfst1 = tex1Dfetch(gAliTexRefu, ((char*) tracker.Data().FirstHitInBin(row) - tracker.pData()->GPUTextureBase()) / sizeof(unsigned short) + fIndYmin+nY);
+                 fHitYlst1 = tex1Dfetch(gAliTexRefu, ((char*) tracker.Data().FirstHitInBin(row) - tracker.pData()->GPUTextureBase()) / sizeof(unsigned short) + fIndYmin+nY+2);
+#else
           fHitYfst = sGridP[fIndYmin];
           fHitYlst = sGridP[fIndYmin+2];
           fHitYfst1 = sGridP[fIndYmin+nY];
           fHitYlst1 = sGridP[fIndYmin+nY+2];
-          assert( fHitYfst <= row.NHits() );
-          assert( fHitYlst <= row.NHits() );
-          assert( fHitYfst1 <= row.NHits() );
-          assert( fHitYlst1 <= row.NHits() );
+#endif
+          assert( (signed) fHitYfst <= row.NHits() );
+          assert( (signed) fHitYlst <= row.NHits() );
+          assert( (signed) fHitYfst1 <= row.NHits() );
+          assert( (signed) fHitYlst1 <= row.NHits() );
           if ( drawSearch ) {
 #ifdef DRAW
             std::cout << " Grid, row " << iRow << ": nHits=" << row.NHits() << ", grid n=" << row.Grid().N() << ", c[n]=" << sGridP[row.Grid().N()] << std::endl;
@@ -494,22 +586,27 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet
             }
 #endif
           }
-          if ( sGridP[row.Grid().N()] != row.NHits() ) {
 #ifdef DRAW
+          if ( sGridP[row.Grid().N()] != row.NHits() ) {
             std::cout << " grid, row " << iRow << ": nHits=" << row.NHits() << ", grid n=" << row.Grid().N() << ", c[n]=" << sGridP[row.Grid().N()] << std::endl;
             //exit(0);
-#endif
           }
+#endif
         }
+#ifdef DRAW
         if ( drawSearch ) {
-          #ifdef DRAW
           std::cout << " tracklet " << r.fItr << ", row " << iRow << ", yz= " << fY << "," << fZ << ": search hits=" << fHitYfst << " " << fHitYlst << " / " << fHitYfst1 << " " << fHitYlst1 << std::endl;
           std::cout << " hit search :" << std::endl;
-          #endif
         }
+#endif
         for ( unsigned int fIh = fHitYfst; fIh < fHitYlst; fIh++ ) {
-          assert( fIh < row.NHits() );
-          ushort2 hh = hits[fIh];
+          assert( (signed) fIh < row.NHits() );
+          ushort2 hh;
+#if defined(HLTCA_GPU_TEXTURE_FETCH)
+                hh = tex1Dfetch(gAliTexRefu2, ((char*) tracker.Data().HitData() - tracker.pData()->GPUTextureBase()) / sizeof(ushort2) + row.HitNumberOffset() + fIh);
+#else
+                 hh = hits[fIh];
+#endif
           int ddy = ( int )( hh.x ) - fY0;
           int ddz = ( int )( hh.y ) - fZ0;
           int dds = CAMath::Abs( ddy ) + CAMath::Abs( ddz );
@@ -524,8 +621,13 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet
           }
         }
 
-        for ( unsigned int fIh = fHitYfst1; fIh < fHitYlst1; fIh++ ) {
-          ushort2 hh = hits[fIh];
+               for ( unsigned int fIh = fHitYfst1; fIh < fHitYlst1; fIh++ ) {
+          ushort2 hh;
+#if defined(HLTCA_GPU_TEXTURE_FETCH)
+                 hh = tex1Dfetch(gAliTexRefu2, ((char*) tracker.Data().HitData() - tracker.pData()->GPUTextureBase()) / sizeof(ushort2) + row.HitNumberOffset() + fIh);
+#else
+                 hh = hits[fIh];
+#endif
           int ddy = ( int )( hh.x ) - fY0;
           int ddz = ( int )( hh.y ) - fZ0;
           int dds = CAMath::Abs( ddy ) + CAMath::Abs( ddz );
@@ -541,7 +643,15 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet
         }
       }// end of search for the closest hit
 
-      if ( best < 0 ) break;
+      if ( best < 0 )
+         {
+#ifndef EXTERN_ROW_HITS
+                 tracklet.SetRowHit(iRow, -1);
+#else
+                 tracker.TrackletRowHits()[iRow * s.fNTracklets + r.fItr] = -1;
+#endif
+                 break;
+         }
       if ( drawSearch ) {
         #ifdef DRAW
         std::cout << "hit search " << r.fItr << ", row " << iRow << " hit " << best << " found" << std::endl;
@@ -552,7 +662,12 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet
                #endif
       }
 
-      ushort2 hh = hits[best];
+      ushort2 hh;
+#if defined(HLTCA_GPU_TEXTURE_FETCH)
+                hh = tex1Dfetch(gAliTexRefu2, ((char*) tracker.Data().HitData() - tracker.pData()->GPUTextureBase()) / sizeof(ushort2) + row.HitNumberOffset() + best);
+#else
+                 hh = hits[best];
+#endif
 
       //std::cout<<"mark 3, "<<r.fItr<<std::endl;
       //tParam.Print();
@@ -582,6 +697,11 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet
           std::cout << "found hit is out of the chi2 window\n " << std::endl;
           #endif
         }
+#ifndef EXTERN_ROW_HITS
+               tracklet.SetRowHit(iRow, -1);
+#else
+               tracker.TrackletRowHits()[iRow * s.fNTracklets + r.fItr] = -1;
+#endif
         break;
       }
 #ifdef DRAW
@@ -598,7 +718,11 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet
         }
         break;
       }
-      if ( SAVE() ) tracklet.SetRowHit( iRow, best );
+#ifndef EXTERN_ROW_HITS
+         tracklet.SetRowHit( iRow, best );
+#else
+         tracker.TrackletRowHits()[iRow * s.fNTracklets + r.fItr] = best;
+#endif
       if ( drawSearch ) {
         #ifdef DRAW
         std::cout << "tracklet " << r.fItr << " after filter at row " << iRow << " : " << std::endl;
@@ -615,70 +739,528 @@ GPUd() void AliHLTTPCCATrackletConstructor::UpdateTracklet
   }
 }
 
+#ifdef HLTCA_GPUCODE
+GPUd() void AliHLTTPCCATrackletConstructor::CopyTrackletTempData( AliHLTTPCCAThreadMemory &rMemSrc, AliHLTTPCCAThreadMemory &rMemDst, AliHLTTPCCATrackParam &tParamSrc, AliHLTTPCCATrackParam &tParamDst)
+{
+       //Copy Temporary Tracklet data from registers to global mem and vice versa
+       rMemDst.fStartRow = rMemSrc.fStartRow;
+       rMemDst.fEndRow = rMemSrc.fEndRow;
+       rMemDst.fFirstRow = rMemSrc.fFirstRow;
+       rMemDst.fLastRow = rMemSrc.fLastRow;
+       rMemDst.fCurrIH =  rMemSrc.fCurrIH;
+       rMemDst.fGo = rMemSrc.fGo;
+       rMemDst.fStage = rMemSrc.fStage;
+       rMemDst.fNHits = rMemSrc.fNHits;
+       rMemDst.fNMissed = rMemSrc.fNMissed;
+       rMemDst.fLastY = rMemSrc.fLastY;
+       rMemDst.fLastZ = rMemSrc.fLastZ;
+
+       tParamDst.SetSinPhi( tParamSrc.GetSinPhi() );
+       tParamDst.SetDzDs( tParamSrc.GetDzDs() );
+       tParamDst.SetQPt( tParamSrc.GetQPt() );
+       tParamDst.SetSignCosPhi( tParamSrc.GetSignCosPhi() );
+       tParamDst.SetChi2( tParamSrc.GetChi2() );
+       tParamDst.SetNDF( tParamSrc.GetNDF() );
+       tParamDst.SetCov( 0, tParamSrc.GetCov(0) );
+       tParamDst.SetCov( 1, tParamSrc.GetCov(1) );
+       tParamDst.SetCov( 2, tParamSrc.GetCov(2) );
+       tParamDst.SetCov( 3, tParamSrc.GetCov(3) );
+       tParamDst.SetCov( 4, tParamSrc.GetCov(4) );
+       tParamDst.SetCov( 5, tParamSrc.GetCov(5) );
+       tParamDst.SetCov( 6, tParamSrc.GetCov(6) );
+       tParamDst.SetCov( 7, tParamSrc.GetCov(7) );
+       tParamDst.SetCov( 8, tParamSrc.GetCov(8) );
+       tParamDst.SetCov( 9, tParamSrc.GetCov(9) );
+       tParamDst.SetCov( 10, tParamSrc.GetCov(10) );
+       tParamDst.SetCov( 11, tParamSrc.GetCov(11) );
+       tParamDst.SetCov( 12, tParamSrc.GetCov(12) );
+       tParamDst.SetCov( 13, tParamSrc.GetCov(13) );
+       tParamDst.SetCov( 14, tParamSrc.GetCov(14) );
+       tParamDst.SetX( tParamSrc.GetX() );
+       tParamDst.SetY( tParamSrc.GetY() );
+       tParamDst.SetZ( tParamSrc.GetZ() );
+}
 
+GPUd() int AliHLTTPCCATrackletConstructor::FetchTracklet(AliHLTTPCCATracker &tracker, AliHLTTPCCASharedMemory &sMem, int Reverse, int RowBlock, int &mustInit)
+{
+       //Fetch a new trackled to be processed by this thread
+       __syncthreads();
+       int nextTracketlFirstRun = sMem.fNextTrackletFirstRun;
+       if (threadIdx.x  == TRACKLET_CONSTRUCTOR_NMEMTHREDS)
+       {
+               sMem.fNTracklets = *tracker.NTracklets();
+               if (sMem.fNextTrackletFirstRun)
+               {
+#ifdef HLTCA_GPU_SCHED_FIXED_START
+                       const int iSlice = tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + (HLTCA_GPU_BLOCK_COUNT % tracker.GPUParametersConst()->fGPUnSlices != 0 && tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + 1) % HLTCA_GPU_BLOCK_COUNT != 0)) / HLTCA_GPU_BLOCK_COUNT;
+                       const int nSliceBlockOffset = HLTCA_GPU_BLOCK_COUNT * iSlice / tracker.GPUParametersConst()->fGPUnSlices;
+                       const uint2 &nTracklet = tracker.BlockStartingTracklet()[blockIdx.x - nSliceBlockOffset];
+
+                       sMem.fNextTrackletCount = nTracklet.y;
+                       if (sMem.fNextTrackletCount == 0)
+                       {
+                               sMem.fNextTrackletFirstRun = 0;
+                       }
+                       else
+                       {
+                               if (tracker.TrackletStartHits()[nTracklet.x].RowIndex() / HLTCA_GPU_SCHED_ROW_STEP != RowBlock)
+                               {
+                                       sMem.fNextTrackletCount = 0;
+                               }
+                               else
+                               {
+                                       sMem.fNextTrackletFirst = nTracklet.x;
+                                       sMem.fNextTrackletNoDummy = 1;
+                               }
+                       }
+#endif
+               }
+               else
+               {
+                       const int nFetchTracks = CAMath::Max(CAMath::Min((*tracker.RowBlockPos(Reverse, RowBlock)).x - (*tracker.RowBlockPos(Reverse, RowBlock)).y, HLTCA_GPU_THREAD_COUNT - TRACKLET_CONSTRUCTOR_NMEMTHREDS), 0);
+                       sMem.fNextTrackletCount = nFetchTracks;
+                       const int nUseTrack = nFetchTracks ? CAMath::AtomicAdd(&(*tracker.RowBlockPos(Reverse, RowBlock)).y, nFetchTracks) : 0;
+                       sMem.fNextTrackletFirst = nUseTrack;
+
+                       const int nFillTracks = CAMath::Min(nFetchTracks, nUseTrack + nFetchTracks - (*((volatile int2*) (tracker.RowBlockPos(Reverse, RowBlock)))).x);
+                       if (nFillTracks > 0)
+                       {
+                               const int nStartFillTrack = CAMath::AtomicAdd(&(*tracker.RowBlockPos(Reverse, RowBlock)).x, nFillTracks);\r
+                               if (nFillTracks + nStartFillTrack >= HLTCA_GPU_MAX_TRACKLETS)\r
+                               {\r
+                                       tracker.GPUParameters()->fGPUError = HLTCA_GPU_ERROR_ROWBLOCK_TRACKLET_OVERFLOW;\r
+                               }\r
+                               for (int i = 0;i < nFillTracks;i++)\r
+                               {\r
+                                       tracker.RowBlockTracklets(Reverse, RowBlock)[(nStartFillTrack + i) % HLTCA_GPU_MAX_TRACKLETS] = -3;     //Dummy filling track\r
+                               }\r
+                       }
+                       sMem.fNextTrackletNoDummy = 0;\r
+               }
+       }
+       __syncthreads();
+       mustInit = 0;
+       if (sMem.fNextTrackletCount == 0)
+       {
+               return(-2);             //No more track in this RowBlock
+       }
+#if HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS > 0
+       else if (threadIdx.x < TRACKLET_CONSTRUCTOR_NMEMTHREDS)
+       {
+               return(-1);
+       }
+#endif
+       else if (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS >= sMem.fNextTrackletCount)
+       {
+               return(-1);             //No track in this RowBlock for this thread
+       }
+       else if (nextTracketlFirstRun)
+       {
+               if (threadIdx.x == TRACKLET_CONSTRUCTOR_NMEMTHREDS) sMem.fNextTrackletFirstRun = 0;
+               mustInit = 1;
+               return(sMem.fNextTrackletFirst + threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS);
+       }
+       else
+       {
+               const int nTrackPos = sMem.fNextTrackletFirst + threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS;\r
+               mustInit = (nTrackPos < tracker.RowBlockPos(Reverse, RowBlock)->w);\r
+               volatile int* const ptrTracklet = &tracker.RowBlockTracklets(Reverse, RowBlock)[nTrackPos % HLTCA_GPU_MAX_TRACKLETS];\r
+               int nTracklet;\r
+               int nTryCount = 0;\r
+               while ((nTracklet = *ptrTracklet) == -1)\r
+               {\r
+                       for (int i = 0;i < 10000;i++)\r
+                               sMem.fNextTrackletStupidDummy++;\r
+                       nTryCount++;\r
+                       if (nTryCount > 20)\r
+                       {\r
+                               tracker.GPUParameters()->fGPUError = HLTCA_GPU_ERROR_SCHEDULE_COLLISION;\r
+                               return(-1);\r
+                       }\r
+               };\r
+               return(nTracklet);\r
+       }
+}
 
-GPUd() void AliHLTTPCCATrackletConstructor::Thread
-( int nBlocks, int nThreads, int iBlock, int iThread, int iSync,
-  AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam )
+GPUd() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorNewGPU(AliHLTTPCCATracker *pTracker)
 {
+       //Main Tracklet construction function that calls the scheduled (FetchTracklet) and then Processes the tracklet (mainly UpdataTracklet) and at the end stores the tracklet.
+       //Can also dispatch a tracklet to be rescheduled
+#ifdef HLTCA_GPU_EMULATION_SINGLE_TRACKLET
+       pTracker[0].BlockStartingTracklet()[0].x = HLTCA_GPU_EMULATION_SINGLE_TRACKLET;
+       pTracker[0].BlockStartingTracklet()[0].y = 1;
+       for (int i = 1;i < HLTCA_GPU_BLOCK_COUNT;i++)
+       {
+               pTracker[0].BlockStartingTracklet()[i].x = pTracker[0].BlockStartingTracklet()[i].y = 0;
+       }
+#endif
 
-  // reconstruction of tracklets
-  if ( iSync == 0 ) {
-    Step0( nBlocks, nThreads, iBlock, iThread, s, r, tracker, tParam );
-  } else if ( iSync == 1 ) {
-    Step1( nBlocks, nThreads, iBlock, iThread, s, r, tracker, tParam );
-  } else if ( iSync == 2 ) {
-    Step2( nBlocks, nThreads, iBlock, iThread, s, r, tracker, tParam );
-  }
+       GPUshared() AliHLTTPCCASharedMemory sMem;
 
-  else if ( iSync == 3 )
-
-  {
-    r.fCurrentData = 1;
-    ReadData( iThread, s, r, tracker, s.fMinStartRow );
-    r.fCurrentData = 0;
-    r.fNMissed = 0;
-  } else if ( iSync == 3 + 159 + 1 ) {
-    r.fCurrentData = 1;
-    int nextRow = s.fMaxEndRow;
-    if ( nextRow < 0 ) nextRow = 0;
-    ReadData( iThread, s, r, tracker, nextRow );
-    r.fCurrentData = 0;
-    r.fNMissed = 0;
-    r.fStage = 2;
-    if ( r.fGo ) {
-      const AliHLTTPCCARow &row = tracker.Row( r.fEndRow );
-      float x = row.X();
-      if ( !tParam.TransportToX( x, tracker.Param().ConstBz(), .999 ) ) r.fGo = 0;
-    }
-  }
+#ifdef HLTCA_GPU_SCHED_FIXED_START
+       if (threadIdx.x == TRACKLET_CONSTRUCTOR_NMEMTHREDS)
+       {
+               sMem.fNextTrackletFirstRun = 1;
+       }
+       __syncthreads();
+#endif
 
-  else if ( iSync <= 3 + 159 + 1 + 159 ) {
-    int iRow, nextRow;
-    if (  iSync <= 3 + 159 ) {
-      iRow = iSync - 4;
-      if ( iRow < s.fMinStartRow ) return;
-      nextRow = iRow + 1;
-      if ( nextRow > 158 ) nextRow = 158;
-    } else {
-      iRow = 158 - ( iSync - 4 - 159 - 1 );
-      if ( iRow > s.fMaxEndRow ) return;
-      nextRow = iRow - 1;
-      if ( nextRow < 0 ) nextRow = 0;
-    }
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+       if (threadIdx.x == TRACKLET_CONSTRUCTOR_NMEMTHREDS)
+       {
+               sMem.fMaxSync = 0;
+       }
+       int threadSync = 0;
+#endif
 
-    if ( r.fIsMemThread ) {
-      ReadData( iThread, s, r, tracker, nextRow );
-    } else {
-      UpdateTracklet( nBlocks, nThreads, iBlock, iThread,
-                      s, r, tracker, tParam, iRow );
-    }
-    r.fCurrentData = !r.fCurrentData;
-  }
+       for (int iReverse = 0;iReverse < 2;iReverse++)
+       {
+               for (int iRowBlock = 0;iRowBlock < HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1;iRowBlock++)
+               {
+#ifdef HLTCA_GPU_SCHED_FIXED_SLICE
+                       int iSlice = tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + (HLTCA_GPU_BLOCK_COUNT % tracker.GPUParametersConst()->fGPUnSlices != 0 && tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + 1) % HLTCA_GPU_BLOCK_COUNT != 0)) / HLTCA_GPU_BLOCK_COUNT;
+#else
+                       for (int iSlice = 0;iSlice < pTracker[0].GPUParametersConst()->fGPUnSlices;iSlice++)
+#endif
+                       {
+                               AliHLTTPCCATracker &tracker = pTracker[iSlice];
+                               if (sMem.fNextTrackletFirstRun && iSlice != tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + (HLTCA_GPU_BLOCK_COUNT % tracker.GPUParametersConst()->fGPUnSlices != 0 && tracker.GPUParametersConst()->fGPUnSlices * (blockIdx.x + 1) % HLTCA_GPU_BLOCK_COUNT != 0)) / HLTCA_GPU_BLOCK_COUNT)
+                               {
+                                       continue;
+                               }
+                               /*if (!sMem.fNextTrackletFirstRun && tracker.RowBlockPos(1, HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP)->x <= tracker.RowBlockPos(1, HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP)->y)
+                               {
+                                       continue;
+                               }*/
+                               int sharedRowsInitialized = 0;
+
+                               int iTracklet;
+                               int mustInit;
+                               while ((iTracklet = FetchTracklet(tracker, sMem, iReverse, iRowBlock, mustInit)) != -2)
+                               {
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+                                       CAMath::AtomicMax(&sMem.fMaxSync, threadSync);
+                                       __syncthreads();
+                                       threadSync = CAMath::Min(sMem.fMaxSync, 100000000 / blockDim.x / gridDim.x);
+#endif
+#ifndef HLTCA_GPU_PREFETCHDATA
+                                       if (!sharedRowsInitialized)
+                                       {
+#ifdef HLTCA_GPU_PREFETCH_ROWBLOCK_ONLY
+                                               if (iReverse)
+                                               {
+                                                       for (int i = CAMath::Max(0, HLTCA_ROW_COUNT - (iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP) * sizeof(AliHLTTPCCARow) / sizeof(int) + threadIdx.x;i < (HLTCA_ROW_COUNT - iRowBlock * HLTCA_GPU_SCHED_ROW_STEP) * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x)
+                                                       {
+                                                               reinterpret_cast<int*>(&sMem.fRows)[i] = reinterpret_cast<int*>(tracker.SliceDataRows())[i];
+                                                       }
+                                               }
+                                               else
+                                               {
+                                                       for (int i = CAMath::Max(1, iRowBlock * HLTCA_GPU_SCHED_ROW_STEP) * sizeof(AliHLTTPCCARow) / sizeof(int) + threadIdx.x;i < CAMath::Min((iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP, HLTCA_ROW_COUNT) * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x)
+                                                       {
+                                                               reinterpret_cast<int*>(&sMem.fRows)[i] = reinterpret_cast<int*>(tracker.SliceDataRows())[i];
+                                                       }
+                                               }
+#else
+                                               for (int i = threadIdx.x;i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += blockDim.x)
+                                               {
+                                                       reinterpret_cast<int*>(&sMem.fRows)[i] = reinterpret_cast<int*>(tracker.SliceDataRows())[i];
+                                               }
+#endif
+                                               sharedRowsInitialized = 1;
+                                       }
+#endif
+#ifdef HLTCA_GPU_RESCHED
+                                       short2 storeToRowBlock;
+                                       int storePosition = 0;
+                                       if (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1))
+                                       {
+                                               const int nReverse = (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS) / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+                                               const int nRowBlock = (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS) % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+                                               sMem.fTrackletStoreCount[nReverse][nRowBlock] = 0;
+                                       }
+#endif
+                                       __syncthreads();
+                                       AliHLTTPCCATrackParam tParam;
+                                       AliHLTTPCCAThreadMemory rMem;
 
-  else if ( iSync == 4 + 159*2 + 1 + 1 ) { //
-    StoreTracklet( nBlocks, nThreads, iBlock, iThread,
-                   s, r, tracker, tParam );
-  }
+                                       rMem.fCurrentData = 0;
+
+#ifdef HLTCA_GPU_EMULATION_DEBUG_TRACKLET
+                                       if (iTracklet == HLTCA_GPU_EMULATION_DEBUG_TRACKLET)
+                                       {
+                                               tracker.GPUParameters()->fGPUError = 1;
+                                       }
+#endif
+                                       AliHLTTPCCAThreadMemory &rMemGlobal = tracker.GPUTrackletTemp()[iTracklet].fThreadMem;
+                                       AliHLTTPCCATrackParam &tParamGlobal = tracker.GPUTrackletTemp()[iTracklet].fParam;
+                                       if (mustInit)
+                                       {
+                                               AliHLTTPCCAHitId id = tracker.TrackletStartHits()[iTracklet];
+
+                                               rMem.fStartRow = rMem.fEndRow = rMem.fFirstRow = rMem.fLastRow = id.RowIndex();
+                                               rMem.fCurrIH = id.HitIndex();
+                                               rMem.fStage = 0;
+                                               rMem.fNHits = 0;
+                                               rMem.fNMissed = 0;
+
+                                               AliHLTTPCCATrackletConstructor::InitTracklet(tParam);
+                                       }
+                                       else if (iTracklet >= 0)
+                                       {
+                                               CopyTrackletTempData( rMemGlobal, rMem, tParamGlobal, tParam );
+                                       }
+#ifdef HLTCA_GPU_PREFETCHDATA
+                                       else if (threadIdx.x < TRACKLET_CONSTRUCTOR_NMEMTHREDS)
+                                       {
+                                               ReadData(threadIdx.x, sMem, rMem, tracker, iReverse ? (HLTCA_ROW_COUNT - 1 - iRowBlock * HLTCA_GPU_SCHED_ROW_STEP) : (CAMath::Max(1, iRowBlock * HLTCA_GPU_SCHED_ROW_STEP)));
+                                       }
+#endif
+                                       rMem.fItr = iTracklet;
+                                       rMem.fGo = (iTracklet >= 0);
+
+#ifdef HLTCA_GPU_RESCHED
+                                       storeToRowBlock.x = iRowBlock + 1;
+                                       storeToRowBlock.y = iReverse;
+#ifdef HLTCA_GPU_PREFETCHDATA
+                                       rMem.fCurrentData ^= 1;
+                                       __syncthreads();
+#endif
+                                       if (iReverse)
+                                       {
+                                               for (int j = HLTCA_ROW_COUNT - 1 - iRowBlock * HLTCA_GPU_SCHED_ROW_STEP;j >= CAMath::Max(0, HLTCA_ROW_COUNT - (iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP);j--)
+                                               {
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+                                                       if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && !(j >= rMem.fEndRow || ( j >= rMem.fStartRow && j - rMem.fStartRow % 2 == 0)))
+                                                               pTracker[0].fStageAtSync[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1;
+#endif
+#ifdef HLTCA_GPU_PREFETCHDATA
+                                                       if (threadIdx.x < TRACKLET_CONSTRUCTOR_NMEMTHREDS && j > CAMath::Max(0, HLTCA_ROW_COUNT - (iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP))
+                                                       {
+                                                               ReadData(threadIdx.x, sMem, rMem, tracker, j - 1);
+                                                       }
+                                                       else
+#endif
+                                                       if (iTracklet >= 0)
+                                                       {
+                                                               UpdateTracklet(gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j);
+                                                               if (rMem.fNMissed > kMaxRowGap)
+                                                               {
+                                                                       rMem.fGo = 0;
+#ifndef HLTCA_GPU_PREFETCHDATA
+                                                                       break;
+#endif
+                                                               }
+                                                       }
+#ifdef HLTCA_GPU_PREFETCHDATA
+                                                       __syncthreads();
+                                                       rMem.fCurrentData ^= 1;
+#endif
+                                               }
+                                                       
+                                               if (iTracklet >= 0 && (!rMem.fGo || iRowBlock == HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP))
+                                               {
+                                                       StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam );
+                                               }
+                                       }
+                                       else
+                                       {
+                                               for (int j = CAMath::Max(1, iRowBlock * HLTCA_GPU_SCHED_ROW_STEP);j < CAMath::Min((iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP, HLTCA_ROW_COUNT);j++)
+                                               {
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+                                                       if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && j >= rMem.fStartRow && (rMem.fStage > 0 || rMem.fCurrIH >= 0 || (j - rMem.fStartRow) % 2 == 0 ))
+                                                               pTracker[0].fStageAtSync[threadSync++ * blockDim.x * gridDim.x + blockIdx.x * blockDim.x + threadIdx.x] = rMem.fStage + 1;
+#endif
+#ifdef HLTCA_GPU_PREFETCHDATA
+                                                       if (threadIdx.x < TRACKLET_CONSTRUCTOR_NMEMTHREDS && j < CAMath::Min((iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP, HLTCA_ROW_COUNT) - 1)
+                                                       {
+                                                               ReadData(threadIdx.x, sMem, rMem, tracker, j + 1);
+                                                       }
+                                                       else
+#endif 
+                                                       if (iTracklet >= 0)
+                                                       {
+                                                               UpdateTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j);
+#ifndef HLTCA_GPU_PREFETCHDATA
+                                                               //if (rMem.fNMissed > kMaxRowGap || rMem.fGo == 0) break;       //DR!!! CUDA Crashes with this enabled
+#endif
+                                                       }
+#ifdef HLTCA_GPU_PREFETCHDATA
+                                                       __syncthreads();
+                                                       rMem.fCurrentData ^= 1;
+#endif
+                                               }
+                                               if (rMem.fGo && (rMem.fNMissed > kMaxRowGap || iRowBlock == HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP))
+                                               {
+#if defined(HLTCA_GPU_PREFETCHDATA) | !defined(HLTCA_GPU_PREFETCH_ROWBLOCK_ONLY)
+                                                       if ( !tParam.TransportToX( tracker.Row( rMem.fEndRow ).X(), tracker.Param().ConstBz(), .999 ) )
+#else
+                                                       if ( !tParam.TransportToX( sMem.fRows[ rMem.fEndRow ].X(), tracker.Param().ConstBz(), .999 ) )
+#endif
+                                                       {
+                                                               rMem.fGo = 0;
+                                                       }
+                                                       else
+                                                       {
+                                                               storeToRowBlock.x = (HLTCA_ROW_COUNT - rMem.fEndRow) / HLTCA_GPU_SCHED_ROW_STEP;
+                                                               storeToRowBlock.y = 1;
+                                                               rMem.fNMissed = 0;
+                                                               rMem.fStage = 2;
+                                                       }
+                                               }
+
+                                               if (iTracklet >= 0 && !rMem.fGo)
+                                               {
+                                                       StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam );
+                                               }
+                                       }
+
+                                       if (rMem.fGo && (iRowBlock != HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP || iReverse == 0))
+                                       {
+                                               CopyTrackletTempData( rMem, rMemGlobal, tParam, tParamGlobal );
+                                               storePosition = CAMath::AtomicAdd(&sMem.fTrackletStoreCount[storeToRowBlock.y][storeToRowBlock.x], 1);
+                                       }
+#else
+                                       if (iTracklet >= 0)
+                                       {
+                                               for (int j = rMem.fStartRow;j < HLTCA_ROW_COUNT;j++)\r
+                                               {\r
+                                                       UpdateTracklet(gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j);\r
+                                                       if (!rMem.fGo) break;\r
+                                               }\r
+\r
+                                               rMem.fNMissed = 0;\r
+                                               rMem.fStage = 2;\r
+                                               if ( rMem.fGo )\r
+                                               {\r
+                                                       if ( !tParam.TransportToX( tracker.Row( rMem.fEndRow ).X(), tracker.Param().ConstBz(), .999 ) )  rMem.fGo = 0;\r
+                                               }\r
+\r
+                                               for (int j = rMem.fEndRow;j >= 0;j--)\r
+                                               {\r
+                                                       if (!rMem.fGo) break;\r
+                                                       UpdateTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam, j);\r
+                                               }\r
+\r
+                                               StoreTracklet( gridDim.x, blockDim.x, blockIdx.x, threadIdx.x, sMem, rMem, tracker, tParam );
+                                       }
+#endif
+#ifdef HLTCA_GPU_RESCHED
+                                       __syncthreads();
+                                       if (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1))
+                                       {
+                                               const int nReverse = (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS) / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+                                               const int nRowBlock = (threadIdx.x - TRACKLET_CONSTRUCTOR_NMEMTHREDS) % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
+                                               if (sMem.fTrackletStoreCount[nReverse][nRowBlock])
+                                               {
+                                                       sMem.fTrackletStoreCount[nReverse][nRowBlock] = CAMath::AtomicAdd(&tracker.RowBlockPos(nReverse, nRowBlock)->x, sMem.fTrackletStoreCount[nReverse][nRowBlock]);
+                                               }
+                                       }
+                                       __syncthreads();
+                                       if (iTracklet >= 0 && rMem.fGo && (iRowBlock != HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP || iReverse == 0))
+                                       {
+                                               tracker.RowBlockTracklets(storeToRowBlock.y, storeToRowBlock.x)[sMem.fTrackletStoreCount[storeToRowBlock.y][storeToRowBlock.x] + storePosition] = iTracklet;
+                                       }
+                                       __syncthreads();
+#endif
+                               }
+                       }
+               }
+       }
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+
+#endif
 }
 
+GPUd() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorInit(int iTracklet, AliHLTTPCCATracker &tracker)
+{
+       //Initialize Row Blocks
+
+#ifndef HLTCA_GPU_EMULATION_SINGLE_TRACKLET
+AliHLTTPCCAHitId id = tracker.TrackletStartHits()[iTracklet];
+#ifdef HLTCA_GPU_SCHED_FIXED_START
+       const int firstDynamicTracklet = tracker.GPUParameters()->fScheduleFirstDynamicTracklet;
+       if (iTracklet >= firstDynamicTracklet)
+#endif
+       {
+               const int firstTrackletInRowBlock = CAMath::Max(firstDynamicTracklet, tracker.RowStartHitCountOffset()[CAMath::Max(id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP * HLTCA_GPU_SCHED_ROW_STEP, 1)].z);
+               if (iTracklet == firstTrackletInRowBlock)
+               {
+                       const int firstRowInNextBlock = (id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_SCHED_ROW_STEP;
+                       int trackletsInRowBlock;
+                       if (firstRowInNextBlock >= HLTCA_ROW_COUNT - 3)
+                               trackletsInRowBlock = *tracker.NTracklets() - firstTrackletInRowBlock;
+                       else
+                               trackletsInRowBlock = CAMath::Max(firstDynamicTracklet, tracker.RowStartHitCountOffset()[firstRowInNextBlock].z) - firstTrackletInRowBlock;
+
+                       tracker.RowBlockPos(0, id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP)->x = trackletsInRowBlock;
+                       tracker.RowBlockPos(0, id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP)->w = trackletsInRowBlock;
+               }
+               tracker.RowBlockTracklets(0, id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP)[iTracklet - firstTrackletInRowBlock] = iTracklet;
+       }
+#endif
+}
+
+GPUg() void AliHLTTPCCATrackletConstructorInit(int iSlice)
+{
+       //GPU Wrapper for AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorInit
+       AliHLTTPCCATracker &tracker = ( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker )[iSlice];
+       int i = blockIdx.x * blockDim.x + threadIdx.x;
+       if (i >= *tracker.NTracklets()) return;
+       AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorInit(i, tracker);
+}
+
+GPUg() void AliHLTTPCCATrackletConstructorNewGPU()
+{
+       //GPU Wrapper for AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorNewGPU
+       AliHLTTPCCATracker *pTracker = ( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker );
+       AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorNewGPU(pTracker);
+}
+
+#else
+GPUd() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorNewCPU(AliHLTTPCCATracker &tracker)
+{
+       //Tracklet constructor simple CPU Function that does not neew a scheduler
+       GPUshared() AliHLTTPCCASharedMemory sMem;
+       sMem.fNTracklets = *tracker.NTracklets();
+       for (int iTracklet = 0;iTracklet < *tracker.NTracklets();iTracklet++)
+       {
+               AliHLTTPCCATrackParam tParam;
+               AliHLTTPCCAThreadMemory rMem;
+               
+               AliHLTTPCCAHitId id = tracker.TrackletStartHits()[iTracklet];
+
+               rMem.fStartRow = rMem.fEndRow = rMem.fFirstRow = rMem.fLastRow = id.RowIndex();
+               rMem.fCurrIH = id.HitIndex();
+               rMem.fStage = 0;
+               rMem.fNHits = 0;
+               rMem.fNMissed = 0;
+
+               AliHLTTPCCATrackletConstructor::InitTracklet(tParam);
+
+               rMem.fItr = iTracklet;
+               rMem.fGo = 1;
+
+               for (int j = rMem.fStartRow;j < tracker.Param().NRows();j++)\r
+               {\r
+                       UpdateTracklet(1, 1, 0, iTracklet, sMem, rMem, tracker, tParam, j);\r
+                       if (!rMem.fGo) break;\r
+               }\r
+\r
+               rMem.fNMissed = 0;\r
+               rMem.fStage = 2;\r
+               if ( rMem.fGo )\r
+               {\r
+                       if ( !tParam.TransportToX( tracker.Row( rMem.fEndRow ).X(), tracker.Param().ConstBz(), .999 ) ) rMem.fGo = 0;\r
+               }\r
+\r
+               for (int j = rMem.fEndRow;j >= 0;j--)\r
+               {\r
+                       if (!rMem.fGo) break;\r
+                       UpdateTracklet( 1, 1, 0, iTracklet, sMem, rMem, tracker, tParam, j);\r
+               }\r
+\r
+               StoreTracklet( 1, 1, 0, iTracklet, sMem, rMem, tracker, tParam );
+       }
+}
+#endif
index 2df55b2..dfd272c 100644 (file)
@@ -9,13 +9,22 @@
 #ifndef ALIHLTTPCCATRACKLETCONSTRUCTOR_H
 #define ALIHLTTPCCATRACKLETCONSTRUCTOR_H
 
+#ifdef HLTCA_GPUCODE
+#define HLTCA_GPU_USE_INT short
+#else
+#define HLTCA_GPU_USE_INT int
+#endif
 
 #include "AliHLTTPCCADef.h"
+#include "AliHLTTPCCAGPUConfig.h"
+#include "AliHLTTPCCATrackParam.h"
 
 /**
  * @class AliHLTTPCCATrackletConstructor
  *
  */
+class AliHLTTPCCATracker;
+
 class AliHLTTPCCATrackletConstructor
 {
   public:
@@ -26,24 +35,33 @@ class AliHLTTPCCATrackletConstructor
       public:
 #if !defined(HLTCA_GPUCODE)
         AliHLTTPCCASharedMemory()
-            : fItr0( 0 ), fItr1( 0 ), fNRows( 0 ), fMinStartRow( 0 ), fMaxEndRow( 0 ) {}
+                       : fNextTrackletFirst(0), fNextTrackletCount(0), fNextTrackletNoDummy(0), fNextTrackletStupidDummy(0), fNextTrackletFirstRun(0), fNTracklets(0), fSliceDone(0) {}
 
         AliHLTTPCCASharedMemory( const AliHLTTPCCASharedMemory& /*dummy*/ )
-            : fItr0( 0 ), fItr1( 0 ), fNRows( 0 ), fMinStartRow( 0 ), fMaxEndRow( 0 ) {}
+                       : fNextTrackletFirst(0), fNextTrackletCount(0), fNextTrackletNoDummy(0), fNextTrackletStupidDummy(0), fNextTrackletFirstRun(0), fNTracklets(0), fSliceDone(0) {}
         AliHLTTPCCASharedMemory& operator=( const AliHLTTPCCASharedMemory& /*dummy*/ ) { return *this; }
 #endif
 
-#ifndef CUDA_DEVICE_EMULATION
       protected:
+#ifdef HLTCA_GPU_PREFETCHDATA
+        uint4 fData[2][ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM / 4]; // temp memory
+               AliHLTTPCCARow fRow[2]; // row
+#else
+               AliHLTTPCCARow fRows[HLTCA_ROW_COUNT];
+#endif
+               int fNextTrackletFirst;
+               int fNextTrackletCount;
+               int fNextTrackletNoDummy;
+               int fNextTrackletStupidDummy;
+               int fNextTrackletFirstRun;
+               int fNTracklets;
+               int fSliceDone;
+
+#ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
+               int fMaxSync;
 #endif
 
-        uint4 fData[2][( ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM+ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM+ALIHLTTPCCATRACKLET_CONSTRUCTOR_TEMP_MEM )/4]; // temp memory
-        int fItr0; // start track index
-        int fItr1; // end track index
-        int fNRows; // n rows
-        int fMinStartRow; // min start row
-        int fMinStartRow32[32]; // min start row for each thread in warp
-        int fMaxEndRow; // max start row
+               int fTrackletStoreCount[2][HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1];
     };
 
     class  AliHLTTPCCAThreadMemory
@@ -52,25 +70,21 @@ class AliHLTTPCCATrackletConstructor
       public:
 #if !defined(HLTCA_GPUCODE)
         AliHLTTPCCAThreadMemory()
-            : fItr( 0 ), fFirstRow( 0 ), fLastRow( 0 ), fStartRow( 0 ), fEndRow( 0 ), fCurrIH( 0 ), fIsMemThread( 0 ), fGo( 0 ), fSave( 0 ), fCurrentData( 0 ), fStage( 0 ), fNHits( 0 ), fNMissed( 0 ), fLastY( 0 ), fLastZ( 0 ) {}
+            : fItr( 0 ), fFirstRow( 0 ), fLastRow( 0 ), fStartRow( 0 ), fEndRow( 0 ), fCurrIH( 0 ), fGo( 0 ), fCurrentData( 0 ), fStage( 0 ), fNHits( 0 ), fNMissed( 0 ), fLastY( 0 ), fLastZ( 0 ) {}
 
         AliHLTTPCCAThreadMemory( const AliHLTTPCCAThreadMemory& /*dummy*/ )
-            : fItr( 0 ), fFirstRow( 0 ), fLastRow( 0 ), fStartRow( 0 ), fEndRow( 0 ), fCurrIH( 0 ), fIsMemThread( 0 ), fGo( 0 ), fSave( 0 ), fCurrentData( 0 ), fStage( 0 ), fNHits( 0 ), fNMissed( 0 ), fLastY( 0 ), fLastZ( 0 ) {}
+            : fItr( 0 ), fFirstRow( 0 ), fLastRow( 0 ), fStartRow( 0 ), fEndRow( 0 ), fCurrIH( 0 ), fGo( 0 ), fCurrentData( 0 ), fStage( 0 ), fNHits( 0 ), fNMissed( 0 ), fLastY( 0 ), fLastZ( 0 ) {}
         AliHLTTPCCAThreadMemory& operator=( const AliHLTTPCCAThreadMemory& /*dummy*/ ) { return *this; }
 #endif
 
-#ifndef CUDA_DEVICE_EMULATION
       protected:
-#endif
         int fItr; // track index
         int fFirstRow;  // first row index
         int fLastRow; // last row index
         int fStartRow;  // first row index
         int fEndRow;  // first row index
         int fCurrIH; // indef of the current hit
-        bool fIsMemThread; // is the thread used for memory taken
         bool fGo; // do fit/searching flag
-        bool fSave; // save flag
         bool fCurrentData; // index of the current memory array
         int fStage; // reco stage
         int fNHits; // n track hits
@@ -79,21 +93,13 @@ class AliHLTTPCCATrackletConstructor
         float fLastZ; // Z of the last fitted cluster
     };
 
-    GPUd() static int NThreadSyncPoints() { return 4 + 159*2 + 1 + 1; }
+       struct AliHLTTPCCAGPUTempMemory
+       {
+               AliHLTTPCCAThreadMemory fThreadMem;
+               AliHLTTPCCATrackParam fParam;
+       };
 
-    GPUd() static void Thread( int nBlocks, int nThreads, int iBlock, int iThread,
-                               int iSync, AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r,
-                               AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam );
-
-    GPUd() static void Step0
-    ( int nBlocks, int nThreads, int iBlock, int iThread,
-      AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam );
-    GPUd() static void Step1
-    ( int nBlocks, int nThreads, int iBlock, int iThread,
-      AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam );
-    GPUd() static void Step2
-    ( int nBlocks, int nThreads, int iBlock, int iThread,
-      AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam );
+       GPUd() static void InitTracklet ( AliHLTTPCCATrackParam &tParam );
 
     GPUd() static void ReadData( int iThread, AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, int iRow );
 
@@ -105,11 +111,20 @@ class AliHLTTPCCATrackletConstructor
     ( int nBlocks, int nThreads, int iBlock, int iThread,
       AliHLTTPCCASharedMemory &s, AliHLTTPCCAThreadMemory &r, AliHLTTPCCATracker &tracker, AliHLTTPCCATrackParam &tParam );
 
+#ifdef HLTCA_GPUCODE
+       GPUd() static void AliHLTTPCCATrackletConstructorNewGPU(AliHLTTPCCATracker *pTracker);
+       GPUd() static int FetchTracklet(AliHLTTPCCATracker &tracker, AliHLTTPCCASharedMemory &sMem, int Reverse, int RowBlock, int &mustInit);
+       GPUd() static void AliHLTTPCCATrackletConstructorInit(int iTracklet, AliHLTTPCCATracker &tracke);
+       GPUd() static void CopyTrackletTempData( AliHLTTPCCAThreadMemory &rMemSrc, AliHLTTPCCAThreadMemory &rMemDst, AliHLTTPCCATrackParam &tParamSrc, AliHLTTPCCATrackParam &tParamDst);
+#else
+       GPUd() static void AliHLTTPCCATrackletConstructorNewCPU(AliHLTTPCCATracker &tracker);
+#endif
+
     GPUd() static bool SAVE() { return 1; }
 
 #if defined(HLTCA_GPUCODE)
     //GPUhd() inline int NMemThreads() { return 128; }
-#define TRACKLET_CONSTRUCTOR_NMEMTHREDS 128
+#define TRACKLET_CONSTRUCTOR_NMEMTHREDS HLTCA_GPU_TRACKLET_CONSTRUCTOR_NMEMTHREDS
 #else
     //GPUhd() inline int NMemThreads() { return 1; }
 #define TRACKLET_CONSTRUCTOR_NMEMTHREDS 1
@@ -118,4 +133,5 @@ class AliHLTTPCCATrackletConstructor
 };
 
 
+
 #endif
index 72aa51e..1660856 100644 (file)
@@ -33,24 +33,31 @@ GPUd() void AliHLTTPCCATrackletSelector::Thread
 
   if ( iSync == 0 ) {
     if ( iThread == 0 ) {
-      if ( iBlock == 0 ) {
-        CAMath::AtomicExch( tracker.NTracks(), 0 );
-        CAMath::AtomicExch( tracker.NTrackHits(), 0 );
-      }
       s.fNTracklets = *tracker.NTracklets();
       s.fNThreadsTotal = nThreads * nBlocks;
       s.fItr0 = nThreads * iBlock;
     }
   } else if ( iSync == 1 ) {
-    AliHLTTPCCATrack tout;
-    AliHLTTPCCAHitId trackHits[160];
+       int nHits, nFirstTrackHit;
+       AliHLTTPCCAHitId trackHits[160 - HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE];
 
     for ( int itr = s.fItr0 + iThread; itr < s.fNTracklets; itr += s.fNThreadsTotal ) {
 
-      AliHLTTPCCATracklet &tracklet = tracker.Tracklets()[itr];
+#ifdef HLTCA_GPU_EMULATION_DEBUG_TRACKLET
+       if (itr == HLTCA_GPU_EMULATION_DEBUG_TRACKLET)
+       {
+               tracker.GPUParameters()->fGPUError = 1;
+       }
+#endif
+
+         while (tracker.Tracklets()[itr].NHits() == 0)
+         {
+                 itr += s.fNThreadsTotal;
+                 if (itr >= s.fNTracklets) return;
+         }
 
-      int tNHits = tracklet.NHits();
-      if ( tNHits <= 0 ) continue;
+         AliHLTTPCCATracklet &tracklet = tracker.Tracklets()[itr];
+      const int tNHits = tracklet.NHits();
 
       const int kMaxRowGap = 4;
       const float kMaxShared = .1;
@@ -58,7 +65,6 @@ GPUd() void AliHLTTPCCATrackletSelector::Thread
       int firstRow = tracklet.FirstRow();
       int lastRow = tracklet.LastRow();
 
-      tout.SetNHits( 0 );
       int kind = 0;
       if ( 0 ) {
         if ( tNHits >= 10 && 1. / .5 >= CAMath::Abs( tracklet.Param().QPt() ) ) { //SG!!!
@@ -70,40 +76,70 @@ GPUd() void AliHLTTPCCATrackletSelector::Thread
 
       //int w = (tNHits<<16)+itr;
       //int nRows = tracker.Param().NRows();
+      //std::cout<<" store tracklet: "<<firstRow<<" "<<lastRow<<std::endl;
+
+         int irow = firstRow;
+
       int gap = 0;
       int nShared = 0;
-      //std::cout<<" store tracklet: "<<firstRow<<" "<<lastRow<<std::endl;
-      for ( int irow = firstRow; irow <= lastRow; irow++ ) {
+         nHits = 0;
+
+         for (irow = firstRow; irow <= lastRow && lastRow - irow + nHits >= TRACKLET_SELECTOR_MIN_HITS; irow++ ) {
         gap++;
-        int ih = tracklet.RowHit( irow );
+#ifdef EXTERN_ROW_HITS
+        int ih = tracker.TrackletRowHits()[irow * s.fNTracklets + itr];
+#else
+               int ih = tracklet.RowHit( irow );
+#endif
         if ( ih >= 0 ) {
           const AliHLTTPCCARow &row = tracker.Row( irow );
           bool own = ( tracker.HitWeight( row, ih ) <= w );
-          bool sharedOK = ( ( tout.NHits() < 0 ) || ( nShared < tout.NHits() * kMaxShared ) );
+          bool sharedOK = ( ( nShared < nHits * kMaxShared ) );
           if ( own || sharedOK ) {//SG!!!
             gap = 0;
-            trackHits[tout.NHits()].Set( irow, ih );
-            tout.SetNHits( tout.NHits() + 1 );
+#if HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE != 0
+                       if (nHits < HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE)
+                               s.fHits[iThread][nHits].Set( irow, ih );
+                       else
+#endif
+                               trackHits[nHits - HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE].Set( irow, ih );
+            nHits++;
             if ( !own ) nShared++;
           }
         }
 
         if ( gap > kMaxRowGap || irow == lastRow ) { // store
-          if ( tout.NHits() >= 10 ) { //SG!!!
+          if ( nHits >= TRACKLET_SELECTOR_MIN_HITS ) { //SG!!!
             int itrout = CAMath::AtomicAdd( tracker.NTracks(), 1 );
-            tout.SetFirstHitID( CAMath::AtomicAdd( tracker.NTrackHits(), tout.NHits() ) );
-            tout.SetParam( tracklet.Param() );
-            tout.SetAlive( 1 );
-            tracker.Tracks()[itrout] = tout;
-            for ( int jh = 0; jh < tout.NHits(); jh++ ) {
-              tracker.TrackHits()[tout.FirstHitID() + jh] = trackHits[jh];
+#ifdef HLTCA_GPUCODE
+                       if (itrout >= HLTCA_GPU_MAX_TRACKS)\r
+                       {\r
+                               tracker.GPUParameters()->fGPUError = HLTCA_GPU_ERROR_TRACK_OVERFLOW;\r
+                               CAMath::AtomicExch( tracker.NTracks(), 0 );\r
+                               return;\r
+                       }\r
+#endif
+            nFirstTrackHit = CAMath::AtomicAdd( tracker.NTrackHits(), nHits );
+                       tracker.Tracks()[itrout].SetAlive(1);
+                       tracker.Tracks()[itrout].SetParam(tracklet.Param());
+                       tracker.Tracks()[itrout].SetFirstHitID(nFirstTrackHit);
+                       tracker.Tracks()[itrout].SetNHits(nHits);
+            for ( int jh = 0; jh < nHits; jh++ ) {
+#if HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE != 0
+                               if (jh < HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE)
+                                       tracker.TrackHits()[nFirstTrackHit + jh] = s.fHits[iThread][jh];
+                               else
+#endif
+                                       tracker.TrackHits()[nFirstTrackHit + jh] = trackHits[jh - HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE];
             }
           }
-          tout.SetNHits( 0 );
+          nHits = 0;
           gap = 0;
           nShared = 0;
         }
       }
+
+
     }
   }
 }
index 7fb238b..a08f452 100644 (file)
@@ -11,6 +11,8 @@
 
 
 #include "AliHLTTPCCADef.h"
+#include "AliHLTTPCCAHitId.h"
+#include "AliHLTTPCCAGPUConfig.h"
 class AliHLTTPCCATracker;
 
 /**
@@ -23,15 +25,15 @@ class AliHLTTPCCATrackletSelector
     class AliHLTTPCCASharedMemory
     {
         friend class AliHLTTPCCATrackletSelector;
-#ifndef CUDA_DEVICE_EMULATION
-      protected:
-#else
-         public:
-#endif
+
+       protected:
         int fItr0; // index of the first track in the block
         int fNThreadsTotal; // total n threads
         int fNTracklets; // n of tracklets
-    };
+#if HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE != 0
+               AliHLTTPCCAHitId fHits[HLTCA_GPU_THREAD_COUNT][HLTCA_GPU_TRACKLET_SELECTOR_HITS_REG_SIZE];
+#endif
+       };
 
     GPUd() static int NThreadSyncPoints() { return 1; }
 
index f4ff3cf..7c7c412 100644 (file)
@@ -144,7 +144,8 @@ AliTPCtrackerCA::AliTPCtrackerCA( const AliTPCParam *par ):
         }
       }
     }
-    hlt.SliceTracker( iSlice ).Initialize( param );
+    //hlt.SliceTracker( iSlice ).Initialize( param );
+       hlt.InitializeSliceParam(iSlice, param);
   }
 }
 
@@ -408,15 +409,16 @@ int AliTPCtrackerCA::Clusters2Tracks( AliESDEvent *event )
         tTPC.SetClusterPointer( row, c );
         AliTPCTrackerPoint &point = *( tTPC.GetTrackPoint( row ) );
         {
-          AliHLTTPCCATracker &slice = hlt.SliceTracker( iSlice );
-          if ( slice.Param().Alpha() != alpha ) {
-            if ( ! t0.Rotate(  slice.Param().Alpha() - alpha, .999 ) ) continue;
-            alpha = slice.Param().Alpha();
+          //AliHLTTPCCATracker &slice = hlt.SliceTracker( iSlice );
+          if ( hlt.Param(iSlice).Alpha() != alpha ) {
+            if ( ! t0.Rotate(  hlt.Param(iSlice).Alpha() - alpha, .999 ) ) continue;
+            alpha = hlt.Param(iSlice).Alpha();
           }
-          float x = slice.Row( row ).X();
-          if ( !t0.TransportToX( x, slice.Param().GetBz( t0 ), .999 ) ) continue;
+          float x = hlt.Row(iSlice, row).X();
+          if ( !t0.TransportToX( x, hlt.Param(iSlice).GetBz( t0 ), .999 ) ) continue;
           float sy2, sz2;
-          slice.GetErrors2( row, t0, sy2, sz2 );
+          //slice.GetErrors2( row, t0, sy2, sz2 );
+                 hlt.Param(iSlice).GetClusterErrors2( row, t0.GetZ(), t0.SinPhi(), t0.GetCosPhi(), t0.DzDs(), sy2, sz2 );
           point.SetSigmaY( c->GetSigmaY2() / sy2 );
           point.SetSigmaZ( c->GetSigmaZ2() / sz2 );
           point.SetAngleY( TMath::Abs( t0.GetSinPhi() / t0.GetCosPhi() ) );
index 8cf3e41..2a48f76 100644 (file)
@@ -33,7 +33,7 @@ GPUhd() static inline void AlignTo( char *&mem )
 }
 
 template<unsigned int X>
-static inline unsigned int NextMultipleOf( unsigned int value )
+GPUhd() static inline unsigned int NextMultipleOf( unsigned int value )
 {
   STATIC_ASSERT( ( X & ( X - 1 ) ) == 0, X_needs_to_be_a_multiple_of_2 );
   const int offset = value & ( X - 1 );