1 #include "AliHLTTPCCAGPUConfig.h"
3 MEM_TEMPLATE4() GPUdi() void AliHLTTPCCATrackletConstructor::CopyTrackletTempData( MEM_TYPE(AliHLTTPCCAThreadMemory) &rMemSrc, MEM_TYPE2(AliHLTTPCCAThreadMemory) &rMemDst, MEM_TYPE3(AliHLTTPCCATrackParam) &tParamSrc, MEM_TYPE4(AliHLTTPCCATrackParam) &tParamDst)
5 //Copy Temporary Tracklet data from registers to global mem and vice versa
// Field-by-field copy of the per-thread tracklet state; the MEM_TYPE* macros let the
// same code copy in either direction between the different memory spaces.
// Copy the row bookkeeping and scheduling state of the tracklet.
6 rMemDst.fStartRow = rMemSrc.fStartRow;
7 rMemDst.fEndRow = rMemSrc.fEndRow;
8 rMemDst.fFirstRow = rMemSrc.fFirstRow;
9 rMemDst.fLastRow = rMemSrc.fLastRow;
10 rMemDst.fCurrIH = rMemSrc.fCurrIH;
11 rMemDst.fGo = rMemSrc.fGo;
12 rMemDst.fStage = rMemSrc.fStage;
13 rMemDst.fNHits = rMemSrc.fNHits;
14 rMemDst.fNMissed = rMemSrc.fNMissed;
15 rMemDst.fLastY = rMemSrc.fLastY;
16 rMemDst.fLastZ = rMemSrc.fLastZ;
// These extra fields only exist in AliHLTTPCCAThreadMemory when the (non-simple)
// alternative scheduler is compiled in, so copy them conditionally.
18 #if defined(HLTCA_GPU_ALTERNATIVE_SCHEDULER) & !defined(HLTCA_GPU_ALTERNATIVE_SCHEDULER_SIMPLE)
19 rMemDst.fItr = rMemSrc.fItr;
20 rMemDst.fIRow = rMemSrc.fIRow;
21 rMemDst.fIRowEnd = rMemSrc.fIRowEnd;
// Copy the full track parameter state, including all 15 covariance matrix entries.
24 tParamDst.SetSinPhi( tParamSrc.GetSinPhi() );
25 tParamDst.SetDzDs( tParamSrc.GetDzDs() );
26 tParamDst.SetQPt( tParamSrc.GetQPt() );
27 tParamDst.SetSignCosPhi( tParamSrc.GetSignCosPhi() );
28 tParamDst.SetChi2( tParamSrc.GetChi2() );
29 tParamDst.SetNDF( tParamSrc.GetNDF() );
30 tParamDst.SetCov( 0, tParamSrc.GetCov(0) );
31 tParamDst.SetCov( 1, tParamSrc.GetCov(1) );
32 tParamDst.SetCov( 2, tParamSrc.GetCov(2) );
33 tParamDst.SetCov( 3, tParamSrc.GetCov(3) );
34 tParamDst.SetCov( 4, tParamSrc.GetCov(4) );
35 tParamDst.SetCov( 5, tParamSrc.GetCov(5) );
36 tParamDst.SetCov( 6, tParamSrc.GetCov(6) );
37 tParamDst.SetCov( 7, tParamSrc.GetCov(7) );
38 tParamDst.SetCov( 8, tParamSrc.GetCov(8) );
39 tParamDst.SetCov( 9, tParamSrc.GetCov(9) );
40 tParamDst.SetCov( 10, tParamSrc.GetCov(10) );
41 tParamDst.SetCov( 11, tParamSrc.GetCov(11) );
42 tParamDst.SetCov( 12, tParamSrc.GetCov(12) );
43 tParamDst.SetCov( 13, tParamSrc.GetCov(13) );
44 tParamDst.SetCov( 14, tParamSrc.GetCov(14) );
45 tParamDst.SetX( tParamSrc.GetX() );
46 tParamDst.SetY( tParamSrc.GetY() );
47 tParamDst.SetZ( tParamSrc.GetZ() );
50 #ifndef HLTCA_GPU_ALTERNATIVE_SCHEDULER
51 GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &tracker, GPUshared() MEM_LOCAL(AliHLTTPCCASharedMemory) &sMem, int Reverse, int RowBlock, int &mustInit)
53 //Fetch a new trackled to be processed by this thread
// Returns the tracklet index this thread should process for the given
// (Reverse, RowBlock) pair, or -1 when there is no tracklet for this particular
// thread, or -2 when this RowBlock has no more tracklets at all.
// mustInit (out) is set on the dynamic path: nonzero means the tracklet's track
// parameters still need to be initialized (see the ->w comparison below).
55 int nextTrackletFirstRun = sMem.fNextTrackletFirstRun;
// Thread 0 of the work-group makes all scheduling decisions; the results are
// published to the other threads via shared memory (sMem).
56 if (get_local_id(0) == 0)
58 sMem.fNTracklets = *tracker.NTracklets();
59 if (sMem.fNextTrackletFirstRun)
61 #ifdef HLTCA_GPU_SCHED_FIXED_START
// First run with fixed-start scheduling: use the host-precomputed per-block
// starting tracklet range. iSlice maps this block onto a slice; the extra
// boolean term distributes remainder blocks when get_num_groups(0) is not a
// multiple of the slice count (NOTE(review): verify against host-side setup).
62 const int iSlice = tracker.GPUParametersConst()->fGPUnSlices * (get_group_id(0) + (get_num_groups(0) % tracker.GPUParametersConst()->fGPUnSlices != 0 && tracker.GPUParametersConst()->fGPUnSlices * (get_group_id(0) + 1) % get_num_groups(0) != 0)) / get_num_groups(0);
63 const int nSliceBlockOffset = get_num_groups(0) * iSlice / tracker.GPUParametersConst()->fGPUnSlices;
64 const uint2 &nTracklet = tracker.BlockStartingTracklet()[get_group_id(0) - nSliceBlockOffset];
// nTracklet.x = first tracklet, nTracklet.y = count for this block.
66 sMem.fNextTrackletCount = nTracklet.y;
67 if (sMem.fNextTrackletCount == 0)
69 sMem.fNextTrackletFirstRun = 0;
// Only serve the statically assigned tracklets if their start row falls in the
// requested RowBlock; otherwise report an empty batch for this call.
73 if (tracker.TrackletStartHits()[nTracklet.x].RowIndex() / HLTCA_GPU_SCHED_ROW_STEP != RowBlock)
75 sMem.fNextTrackletCount = 0;
79 sMem.fNextTrackletFirst = nTracklet.x;
82 #endif //HLTCA_GPU_SCHED_FIXED_START
// Dynamic scheduling path: take up to one work-group worth of tracklets from
// the RowBlock queue. From the usage below, oldPos.x is the fill position and
// oldPos.y the fetch position of the queue (NOTE(review): confirm semantics).
86 const int4 oldPos = *tracker.RowBlockPos(Reverse, RowBlock);
87 const int nFetchTracks = CAMath::Max(CAMath::Min(oldPos.x - oldPos.y, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR), 0);
88 sMem.fNextTrackletCount = nFetchTracks;
// Atomically reserve the fetched range in the global queue fetch position.
89 const int nUseTrack = nFetchTracks ? CAMath::AtomicAdd(&(*tracker.RowBlockPos(Reverse, RowBlock)).y, nFetchTracks) : 0;
90 sMem.fNextTrackletFirst = nUseTrack;
// Fill queue slots that were reserved but not yet produced with negative dummy
// markers (encoding debug info) so consumers do not spin on them forever
// (NOTE(review): intent inferred from the busy-wait further down -- confirm).
92 const int nFillTracks = CAMath::Min(nFetchTracks, nUseTrack + nFetchTracks - (*((volatile int2*) (tracker.RowBlockPos(Reverse, RowBlock)))).x);
95 const int nStartFillTrack = CAMath::AtomicAdd(&(*tracker.RowBlockPos(Reverse, RowBlock)).x, nFillTracks);
96 if (nFillTracks + nStartFillTrack >= HLTCA_GPU_MAX_TRACKLETS)
98 tracker.GPUParameters()->fGPUError = HLTCA_GPU_ERROR_ROWBLOCK_TRACKLET_OVERFLOW;
100 for (int i = 0;i < nFillTracks;i++)
102 tracker.RowBlockTracklets(Reverse, RowBlock)[(nStartFillTrack + i) % HLTCA_GPU_MAX_TRACKLETS] = -(get_group_id(0) * 1000000 + nFetchTracks * 10000 + oldPos.x * 100 + oldPos.y); //Dummy filling track
// All threads: interpret the scheduling result computed by thread 0 above.
109 if (sMem.fNextTrackletCount == 0)
111 return(-2); //No more track in this RowBlock
113 else if (get_local_id(0) >= sMem.fNextTrackletCount)
115 return(-1); //No track in this RowBlock for this thread
117 else if (nextTrackletFirstRun)
// Statically scheduled first run: index is simply first + local thread id.
119 if (get_local_id(0) == 0) sMem.fNextTrackletFirstRun = 0;
121 return(sMem.fNextTrackletFirst + get_local_id(0));
125 const int nTrackPos = sMem.fNextTrackletFirst + get_local_id(0);
// Positions below the ->w watermark have never been processed yet and must be
// initialized from scratch rather than resumed from saved temp data.
126 mustInit = (nTrackPos < tracker.RowBlockPos(Reverse, RowBlock)->w);
127 volatile int* const ptrTracklet = &tracker.RowBlockTracklets(Reverse, RowBlock)[nTrackPos % HLTCA_GPU_MAX_TRACKLETS];
// Busy-wait until the producer has written this queue entry (-1 = not yet
// written). The dummy shared-memory increments presumably keep the compiler
// from eliminating the delay loop -- TODO confirm.
130 while ((nTracklet = *ptrTracklet) == -1)
132 for (int i = 0;i < 20000;i++)
133 sMem.fNextTrackletStupidDummy++;
// Give-up path: flag a scheduling collision error (the wait-abort condition
// lives on lines not visible in this view).
137 tracker.GPUParameters()->fGPUError = HLTCA_GPU_ERROR_SCHEDULE_COLLISION;
145 MEM_CLASS_PRE2 GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(MEM_LG2(AliHLTTPCCATracker) *pTracker, GPUsharedref() AliHLTTPCCATrackletConstructor::MEM_LOCAL(AliHLTTPCCASharedMemory)& sMem)
147 //Main Tracklet construction function that calls the scheduled (FetchTracklet) and then Processes the tracklet (mainly UpdataTracklet) and at the end stores the tracklet.
148 //Can also dispatch a tracklet to be rescheduled
// Debug build: force the whole device to process a single hard-coded tracklet.
149 #ifdef HLTCA_GPU_EMULATION_SINGLE_TRACKLET
150 pTracker[0].BlockStartingTracklet()[0].x = HLTCA_GPU_EMULATION_SINGLE_TRACKLET;
151 pTracker[0].BlockStartingTracklet()[0].y = 1;
152 for (int i = 1;i < get_num_groups(0);i++)
154 pTracker[0].BlockStartingTracklet()[i].x = pTracker[0].BlockStartingTracklet()[i].y = 0;
156 #endif //HLTCA_GPU_EMULATION_SINGLE_TRACKLET
158 //GPUshared() AliHLTTPCCASharedMemory sMem;
// Mark the first scheduling round so FetchTracklet uses the static assignment.
160 #ifdef HLTCA_GPU_SCHED_FIXED_START
161 if (get_local_id(0) == 0)
163 sMem.fNextTrackletFirstRun = 1;
166 #endif //HLTCA_GPU_SCHED_FIXED_START
168 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
169 if (get_local_id(0) == 0)
174 #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
// Outer scheduling loops: process all row blocks in the forward direction
// (iReverse == 0), then all row blocks in the backward direction.
176 for (int iReverse = 0;iReverse < 2;iReverse++)
178 for (volatile int iRowBlock = 0;iRowBlock < HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1;iRowBlock++)
// Choose which slice(s) this block works on: either one fixed slice per block,
// or loop over all slices.
180 #ifdef HLTCA_GPU_SCHED_FIXED_SLICE
181 int iSlice = pTracker[0].GPUParametersConst()->fGPUnSlices * (get_group_id(0) + (get_num_groups(0) % pTracker[0].GPUParametersConst()->fGPUnSlices != 0 && pTracker[0].GPUParametersConst()->fGPUnSlices * (get_group_id(0) + 1) % get_num_groups(0) != 0)) / get_num_groups(0);
183 for (int iSlice = 0;iSlice < pTracker[0].GPUParametersConst()->fGPUnSlices;iSlice++)
184 #endif //HLTCA_GPU_SCHED_FIXED_SLICE
186 AliHLTTPCCATracker &tracker = pTracker[iSlice];
// Skip slices not statically assigned to this block during the first run.
// NOTE(review): the "get_group_id(0) != 7" term looks like leftover debug
// code special-casing block 7 -- confirm whether it is intentional.
187 if (get_group_id(0) != 7 && sMem.fNextTrackletFirstRun && iSlice != (tracker.GPUParametersConst()->fGPUnSlices > get_num_groups(0) ? get_group_id(0) : (tracker.GPUParametersConst()->fGPUnSlices * (get_group_id(0) + (get_num_groups(0) % tracker.GPUParametersConst()->fGPUnSlices != 0 && tracker.GPUParametersConst()->fGPUnSlices * (get_group_id(0) + 1) % get_num_groups(0) != 0)) / get_num_groups(0))))
192 int sharedRowsInitialized = 0;
// Main work loop: keep fetching tracklets for this (direction, rowblock, slice)
// until FetchTracklet reports the row block exhausted (-2).
196 while ((iTracklet = FetchTracklet(tracker, sMem, iReverse, iRowBlock, mustInit)) != -2)
198 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
199 CAMath::AtomicMaxShared(&sMem.fMaxSync, threadSync);
201 threadSync = CAMath::Min(sMem.fMaxSync, 100000000 / get_local_size(0) / get_num_groups(0));
202 #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
// Lazily copy the row geometry into shared memory once per slice, striding the
// copy across all threads of the work-group.
203 if (!sharedRowsInitialized)
205 for (int i = get_local_id(0);i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += get_local_size(0))
207 reinterpret_cast<int*>(&sMem.fRows)[i] = reinterpret_cast<int*>(tracker.SliceDataRows())[i];
209 sharedRowsInitialized = 1;
// Rescheduling support: reset the per-(direction, rowblock) store counters.
211 #ifdef HLTCA_GPU_RESCHED
212 short2 storeToRowBlock;
213 int storePosition = 0;
214 if (get_local_id(0) < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1))
216 const int nReverse = get_local_id(0) / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
217 const int nRowBlock = get_local_id(0) % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
218 sMem.fTrackletStoreCount[nReverse][nRowBlock] = 0;
222 #endif //HLTCA_GPU_RESCHED
224 AliHLTTPCCATrackParam tParam;
225 AliHLTTPCCAThreadMemory rMem;
227 #ifdef HLTCA_GPU_EMULATION_DEBUG_TRACKLET
228 if (iTracklet == HLTCA_GPU_EMULATION_DEBUG_TRACKLET)
230 tracker.GPUParameters()->fGPUError = 1;
232 #endif //HLTCA_GPU_EMULATION_DEBUG_TRACKLET
// Per-tracklet swap area in global memory used to suspend/resume tracklets
// across rescheduling (via CopyTrackletTempData).
233 AliHLTTPCCAThreadMemory &rMemGlobal = tracker.GPUTrackletTemp()[iTracklet].fThreadMem;
234 AliHLTTPCCATrackParam &tParamGlobal = tracker.GPUTrackletTemp()[iTracklet].fParam;
// Fresh tracklet: initialize bookkeeping from its start hit; resumed tracklet:
// restore the previously saved state instead (see else-branch below).
237 AliHLTTPCCAHitId id = tracker.TrackletStartHits()[iTracklet];
239 rMem.fStartRow = rMem.fEndRow = rMem.fFirstRow = rMem.fLastRow = id.RowIndex();
240 rMem.fCurrIH = id.HitIndex();
245 AliHLTTPCCATrackletConstructor::InitTracklet(tParam);
247 else if (iTracklet >= 0)
249 CopyTrackletTempData( rMemGlobal, rMem, tParamGlobal, tParam );
251 rMem.fItr = iTracklet;
252 rMem.fGo = (iTracklet >= 0);
// With rescheduling: process only the rows of the current row block, then park
// the tracklet state for the next row block.
254 #ifdef HLTCA_GPU_RESCHED
255 storeToRowBlock.x = iRowBlock + 1;
256 storeToRowBlock.y = iReverse;
// Reverse pass: walk rows downward within the current row block.
259 for (int j = HLTCA_ROW_COUNT - 1 - iRowBlock * HLTCA_GPU_SCHED_ROW_STEP;j >= CAMath::Max(0, HLTCA_ROW_COUNT - (iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP);j--)
261 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
262 if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && !(j >= rMem.fEndRow || ( j >= rMem.fStartRow && j - rMem.fStartRow % 2 == 0)))
263 pTracker[0].StageAtSync()[threadSync++ * get_global_size(0) + get_global_id(0)] = rMem.fStage + 1;
264 #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
267 UpdateTracklet(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam, j);
268 if (rMem.fNMissed > kMaxRowGap && j <= rMem.fStartRow)
// Tracklet finished (or last row block reached): store the final tracklet.
276 if (iTracklet >= 0 && (!rMem.fGo || iRowBlock == HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP))
278 StoreTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam );
// Forward pass: walk rows upward within the current row block.
283 for (int j = CAMath::Max(1, iRowBlock * HLTCA_GPU_SCHED_ROW_STEP);j < CAMath::Min((iRowBlock + 1) * HLTCA_GPU_SCHED_ROW_STEP, HLTCA_ROW_COUNT);j++)
285 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
286 if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && j >= rMem.fStartRow && (rMem.fStage > 0 || rMem.fCurrIH >= 0 || (j - rMem.fStartRow) % 2 == 0 ))
287 pTracker[0].StageAtSync()[threadSync++ * get_global_size(0) + get_global_id(0)] = rMem.fStage + 1;
288 #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
291 UpdateTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam, j);
292 //if (rMem.fNMissed > kMaxRowGap || rMem.fGo == 0) break; //DR!!! CUDA Crashes with this enabled
// Forward search exhausted: transport back to the end row and switch the
// tracklet to a reverse-direction row block for the backward fit.
295 if (rMem.fGo && (rMem.fNMissed > kMaxRowGap || iRowBlock == HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP))
297 if ( !tParam.TransportToX( sMem.fRows[ rMem.fEndRow ].X(), tracker.Param().ConstBz(), .999 ) )
303 storeToRowBlock.x = (HLTCA_ROW_COUNT - rMem.fEndRow) / HLTCA_GPU_SCHED_ROW_STEP;
304 storeToRowBlock.y = 1;
310 if (iTracklet >= 0 && !rMem.fGo)
312 StoreTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam );
// Still running: park the tracklet state in global memory and count it toward
// the target row block so it can be rescheduled there.
316 if (rMem.fGo && (iRowBlock != HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP || iReverse == 0))
318 CopyTrackletTempData( rMem, rMemGlobal, tParam, tParamGlobal );
319 storePosition = CAMath::AtomicAddShared(&sMem.fTrackletStoreCount[storeToRowBlock.y][storeToRowBlock.x], 1);
// Convert the per-group store counts into global queue base offsets.
323 if (get_local_id(0) < 2 * (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1))
325 const int nReverse = get_local_id(0) / (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
326 const int nRowBlock = get_local_id(0) % (HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP + 1);
327 if (sMem.fTrackletStoreCount[nReverse][nRowBlock])
329 sMem.fTrackletStoreCount[nReverse][nRowBlock] = CAMath::AtomicAdd(&tracker.RowBlockPos(nReverse, nRowBlock)->x, sMem.fTrackletStoreCount[nReverse][nRowBlock]);
// Publish the parked tracklet index into its target row block queue.
333 if (iTracklet >= 0 && rMem.fGo && (iRowBlock != HLTCA_ROW_COUNT / HLTCA_GPU_SCHED_ROW_STEP || iReverse == 0))
335 tracker.RowBlockTracklets(storeToRowBlock.y, storeToRowBlock.x)[sMem.fTrackletStoreCount[storeToRowBlock.y][storeToRowBlock.x] + storePosition] = iTracklet;
// Without rescheduling: run the whole tracklet in one go. Per-warp min/max row
// bounds (160 = above any valid row index) let the warp iterate a common range.
339 if (get_local_id(0) % HLTCA_GPU_WARP_SIZE == 0)
341 sMem.fStartRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE] = 160;
342 sMem.fEndRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE] = 0;
347 CAMath::AtomicMinShared(&sMem.fStartRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE], rMem.fStartRow);
// Forward pass over rows from the warp's minimum start row.
352 for (int j = sMem.fStartRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE];j < HLTCA_ROW_COUNT;j++)
354 UpdateTracklet(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam, j);
355 if (!rMem.fGo) break;
// Transport back to the end row before the backward fit.
362 if ( !tParam.TransportToX( tracker.Row( rMem.fEndRow ).X(), tracker.Param().ConstBz(), .999 ) ) rMem.fGo = 0;
364 CAMath::AtomicMaxShared(&sMem.fEndRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE], rMem.fEndRow);
// Backward pass down to row 0, then store the finished tracklet.
370 for (int j = rMem.fEndRow;j >= 0;j--)
372 if (!rMem.fGo) break;
373 UpdateTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam, j);
376 StoreTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam );
378 #endif //HLTCA_GPU_RESCHED
385 GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorInit(int iTracklet, AliHLTTPCCATracker &tracker)
387 //Initialize Row Blocks
// Per-tracklet initialization of the row block scheduling structures: the first
// (dynamically scheduled) tracklet of each row block records how many tracklets
// the block holds, and every tracklet registers itself in its block's queue.
389 #ifndef HLTCA_GPU_EMULATION_SINGLE_TRACKLET
390 AliHLTTPCCAHitId id = tracker.TrackletStartHits()[iTracklet];
// With fixed-start scheduling, tracklets below fScheduleFirstDynamicTracklet
// are handled by the static assignment and skip the row block setup.
391 #ifdef HLTCA_GPU_SCHED_FIXED_START
392 const int firstDynamicTracklet = tracker.GPUParameters()->fScheduleFirstDynamicTracklet;
393 if (iTracklet >= firstDynamicTracklet)
394 #endif //HLTCA_GPU_SCHED_FIXED_START
// Index of the first dynamically scheduled tracklet in this tracklet's row block
// (row index clamped to >= 1 per the CAMath::Max below).
396 #ifdef HLTCA_GPU_SCHED_FIXED_START
397 const int firstTrackletInRowBlock = CAMath::Max(firstDynamicTracklet, tracker.RowStartHitCountOffset()[CAMath::Max(id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP * HLTCA_GPU_SCHED_ROW_STEP, 1)].z);
399 const int firstTrackletInRowBlock = tracker.RowStartHitCountOffset()[CAMath::Max(id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP * HLTCA_GPU_SCHED_ROW_STEP, 1)].z;
400 #endif //HLTCA_GPU_SCHED_FIXED_START
// Only the first tracklet of each row block computes and writes the block size.
402 if (iTracklet == firstTrackletInRowBlock)
404 const int firstRowInNextBlock = (id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP + 1) * HLTCA_GPU_SCHED_ROW_STEP;
405 int trackletsInRowBlock;
// Last row block (no further block within the row range): it owns all remaining
// tracklets. NOTE(review): the "- 3" margin presumably accounts for rows that
// cannot start a tracklet -- confirm.
406 if (firstRowInNextBlock >= HLTCA_ROW_COUNT - 3)
407 trackletsInRowBlock = *tracker.NTracklets() - firstTrackletInRowBlock;
409 #ifdef HLTCA_GPU_SCHED_FIXED_START
410 trackletsInRowBlock = CAMath::Max(firstDynamicTracklet, tracker.RowStartHitCountOffset()[firstRowInNextBlock].z) - firstTrackletInRowBlock;
412 trackletsInRowBlock = tracker.RowStartHitCountOffset()[firstRowInNextBlock].z - firstTrackletInRowBlock;
413 #endif //HLTCA_GPU_SCHED_FIXED_START
// x = fill count, w = "must init" watermark used by FetchTracklet.
415 tracker.RowBlockPos(0, id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP)->x = trackletsInRowBlock;
416 tracker.RowBlockPos(0, id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP)->w = trackletsInRowBlock;
// Every tracklet enters its own index into the forward (direction 0) queue.
418 tracker.RowBlockTracklets(0, id.RowIndex() / HLTCA_GPU_SCHED_ROW_STEP)[iTracklet - firstTrackletInRowBlock] = iTracklet;
420 #endif //!HLTCA_GPU_EMULATION_SINGLE_TRACKLET
423 GPUg() void AliHLTTPCCATrackletConstructorInit(int iSlice)
425 //GPU Wrapper for AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorInit
// One thread per tracklet of the given slice; excess threads return immediately.
426 AliHLTTPCCATracker &tracker = ( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker )[iSlice];
427 int i = get_global_id(0);
428 if (i >= *tracker.NTracklets()) return;
429 AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorInit(i, tracker);
432 #elif defined(HLTCA_GPU_ALTERNATIVE_SCHEDULER_SIMPLE)
434 GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &tracker, GPUsharedref() MEM_LOCAL(AliHLTTPCCASharedMemory) &sMem, AliHLTTPCCAThreadMemory& /*rMem*/, MEM_PLAIN(AliHLTTPCCATrackParam)& /*tParam*/)
// Simple alternative scheduler: hand out tracklets in work-group-sized batches.
// Returns the first tracklet index of the batch (this thread adds its offset),
// or -2 when the slice has no more tracklets.
436 const int nativeslice = get_group_id(0) % tracker.GPUParametersConst()->fGPUnSlices;
437 const int nTracklets = *tracker.NTracklets();
// First call per slice: start from a statically derived per-block offset
// instead of the shared atomic counter.
439 if (sMem.fNextTrackletFirstRun == 1)
441 if (get_local_id(0) == 0)
443 sMem.fNextTrackletFirst = (get_group_id(0) - nativeslice) / tracker.GPUParametersConst()->fGPUnSlices * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;
444 sMem.fNextTrackletFirstRun = 0;
// Subsequent calls: thread 0 atomically claims the next batch from the global
// fNextTracklet counter; -2 signals exhaustion to the whole work-group.
449 if (get_local_id(0) == 0)
451 if (tracker.GPUParameters()->fNextTracklet < nTracklets)
453 const int firstTracklet = CAMath::AtomicAdd(&tracker.GPUParameters()->fNextTracklet, HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR);
454 if (firstTracklet < nTracklets) sMem.fNextTrackletFirst = firstTracklet;
455 else sMem.fNextTrackletFirst = -2;
459 sMem.fNextTrackletFirst = -2;
464 return (sMem.fNextTrackletFirst);
467 GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) *pTracker, GPUsharedref() AliHLTTPCCATrackletConstructor::MEM_LOCAL(AliHLTTPCCASharedMemory)& sMem)
// Main kernel body for the simple alternative scheduler: each work-group walks
// all slices (starting from its native slice) and processes whole tracklets in
// batches handed out by FetchTracklet.
469 const int nSlices = pTracker[0].GPUParametersConst()->fGPUnSlices;
470 const int nativeslice = get_group_id(0) % nSlices;
471 int currentSlice = -1;
475 sMem.fNextTrackletFirstRun = 1;
478 for (int iSlice = 0;iSlice < nSlices;iSlice++)
480 GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &tracker = pTracker[(nativeslice + iSlice) % nSlices];
483 MEM_PLAIN(AliHLTTPCCATrackParam) tParam;
484 AliHLTTPCCAThreadMemory rMem;
// Fetch batches until the slice is exhausted (-2).
487 while ((tmpTracklet = FetchTracklet(tracker, sMem, rMem, tParam)) != -2)
489 if (tmpTracklet >= 0)
491 rMem.fItr = tmpTracklet + get_local_id(0);
// On slice change: refresh the tracklet count and stride-copy the row
// geometry of the new slice into shared memory.
498 if (iSlice != currentSlice)
500 if (get_local_id(0) == 0)
502 sMem.fNTracklets = *tracker.NTracklets();
505 for (int i = get_local_id(0);i < HLTCA_ROW_COUNT * sizeof(MEM_PLAIN(AliHLTTPCCARow)) / sizeof(int);i += get_local_size(0))
507 reinterpret_cast<GPUsharedref() int*>(&sMem.fRows)[i] = reinterpret_cast<GPUglobalref() int*>(tracker.SliceDataRows())[i];
509 currentSlice = iSlice;
// Valid tracklet for this thread: set up state from the start hit; forward
// pass runs from the start row to the last row.
513 if (rMem.fItr < sMem.fNTracklets)
515 AliHLTTPCCAHitId id = tracker.TrackletStartHits()[rMem.fItr];
517 rMem.fStartRow = rMem.fEndRow = rMem.fFirstRow = rMem.fLastRow = id.RowIndex();
518 rMem.fCurrIH = id.HitIndex();
523 AliHLTTPCCATrackletConstructor::InitTracklet(tParam);
528 iRow = rMem.fStartRow;
529 iRowEnd = tracker.Param().NRows();
534 rMem.fStartRow = rMem.fEndRow = 0;
// Two passes (k = 0: forward extension, k = 1: backward fit); in stage 2 the
// row index decrements, otherwise it increments.
539 for (int k = 0;k < 2;k++)
541 for (;iRow != iRowEnd;iRow += rMem.fStage == 2 ? -1 : 1)
543 UpdateTracklet(0, 0, 0, 0, sMem, rMem, tracker, tParam, iRow);
// After the backward stage, store the finished tracklet.
546 if (rMem.fStage == 2)
548 if (rMem.fItr < sMem.fNTracklets)
550 StoreTracklet( 0, 0, 0, 0, sMem, rMem, tracker, tParam );
// Between passes: transport the parameters back to the end row for the
// backward fit; a failed transport stops the tracklet.
557 if (rMem.fGo) if (!tParam.TransportToX( tracker.Row( rMem.fEndRow ).X(), tracker.Param().ConstBz(), .999)) rMem.fGo = 0;
567 #else //HLTCA_GPU_ALTERNATIVE_SCHEDULER
569 GPUdi() int AliHLTTPCCATrackletConstructor::FetchTracklet(GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &tracker, GPUsharedref() MEM_LOCAL(AliHLTTPCCASharedMemory) &sMem, AliHLTTPCCAThreadMemory &rMem, MEM_PLAIN(AliHLTTPCCATrackParam) &tParam)
// Alternative scheduler fetch with thread compaction: when too few threads in
// the work-group are still active, the surviving tracklets park their state in
// shared swap memory, a fresh batch is claimed, and the parked tracklets are
// reassigned to the lowest thread ids. Returns the first tracklet of the new
// batch, -1 (keep running with current occupancy), or -2 (all work done).
571 const int nativeslice = get_group_id(0) % tracker.GPUParametersConst()->fGPUnSlices;
572 const int nTracklets = *tracker.NTracklets();
574 if (get_local_id(0) == 0) sMem.fTrackletStorePos = 0;
// First call per slice: statically derived per-block starting offset.
576 if (sMem.fNextTrackletFirstRun == 1)
578 if (get_local_id(0) == 0)
580 sMem.fNextTrackletFirst = (get_group_id(0) - nativeslice) / tracker.GPUParametersConst()->fGPUnSlices * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;
581 sMem.fNextTrackletFirstRun = 0;
582 sMem.fNextTrackletCount = HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;
// Not enough threads have retired yet to justify a refill: report -1 so the
// group keeps processing its current tracklets.
587 if (sMem.fNextTrackletCount < HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR - HLTCA_GPU_ALTSCHED_MIN_THREADS)
589 if (get_local_id(0) == 0)
591 sMem.fNextTrackletFirst = -1;
// Threads still carrying a live tracklet park their state into the shared swap
// area at a compacted position.
599 nStorePos = CAMath::AtomicAddShared(&sMem.fTrackletStorePos, 1);
600 CopyTrackletTempData(rMem, sMem.swapMemory[nStorePos].fThreadMem, tParam, sMem.swapMemory[nStorePos].fParam);
// Thread 0 claims the next batch from the global counter (only as many as
// there are idle threads -- fNextTrackletCount).
603 if (get_local_id(0) == 0)
605 if (tracker.GPUParameters()->fNextTracklet >= nTracklets)
607 sMem.fNextTrackletFirst = -1;
611 const int firstTracklet = CAMath::AtomicAdd(&tracker.GPUParameters()->fNextTracklet, sMem.fNextTrackletCount);
612 if (firstTracklet >= nTracklets)
614 sMem.fNextTrackletFirst = -1;
618 sMem.fNextTrackletFirst = firstTracklet;
// Finalize the batch bookkeeping: -2 once nothing was fetched and no thread is
// active; otherwise set fNextTrackletCount to the number of slots NOT actually
// filled (the overshoot past nTracklets), i.e. the idle-thread count.
625 if (get_local_id(0) == 0)
627 if (sMem.fNextTrackletFirst == -1 && sMem.fNextTrackletCount == HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR)
629 sMem.fNextTrackletFirst = -2;
630 sMem.fNextTrackletCount = HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;
632 else if (sMem.fNextTrackletFirst >= 0)
634 if (sMem.fNextTrackletFirst + sMem.fNextTrackletCount >= nTracklets)
636 sMem.fNextTrackletCount = sMem.fNextTrackletFirst + sMem.fNextTrackletCount - nTracklets;
640 sMem.fNextTrackletCount = 0;
// The first fTrackletStorePos threads resume the parked tracklets.
645 if (get_local_id(0) < sMem.fTrackletStorePos)
647 CopyTrackletTempData(sMem.swapMemory[get_local_id(0)].fThreadMem, rMem, sMem.swapMemory[get_local_id(0)].fParam, tParam);
649 return (sMem.fNextTrackletFirst);
652 GPUdi() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) *pTracker, GPUsharedref() AliHLTTPCCATrackletConstructor::MEM_LOCAL(AliHLTTPCCASharedMemory)& sMem)
// Main kernel body for the alternative scheduler: each work-group walks all
// slices (starting from its native slice); tracklets advance in fixed-size row
// steps between FetchTracklet calls so idle threads can be refilled.
654 const int nSlices = pTracker[0].GPUParametersConst()->fGPUnSlices;
655 const int nativeslice = get_group_id(0) % nSlices;
656 //GPUshared() AliHLTTPCCASharedMemory sMem;
657 int currentSlice = -1;
661 sMem.fNextTrackletFirstRun = 1;
664 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
665 if (get_local_id(0) == 0)
670 #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
672 for (int iSlice = 0;iSlice < nSlices;iSlice++)
674 GPUconstant() MEM_CONSTANT(AliHLTTPCCATracker) &tracker = pTracker[(nativeslice + iSlice) % nSlices];
676 MEM_PLAIN(AliHLTTPCCATrackParam) tParam;
677 AliHLTTPCCAThreadMemory rMem;
// Keep fetching/refilling until the slice reports no work left (-2).
681 while ((tmpTracklet = FetchTracklet(tracker, sMem, rMem, tParam)) != -2)
684 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
685 CAMath::AtomicMaxShared(&sMem.fMaxSync, threadSync);
687 threadSync = CAMath::Min(sMem.fMaxSync, 100000000 / get_local_size(0) / get_num_groups(0));
688 #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
// On slice change: refresh the tracklet count and stride-copy the row
// geometry of the new slice into shared memory.
690 if (iSlice != currentSlice)
692 if (get_local_id(0) == 0) sMem.fNTracklets = *tracker.NTracklets();
694 for (int i = get_local_id(0);i < HLTCA_ROW_COUNT * sizeof(MEM_PLAIN(AliHLTTPCCARow)) / sizeof(int);i += get_local_size(0))
696 reinterpret_cast<GPUsharedref() int*>(&sMem.fRows)[i] = reinterpret_cast<GPUglobalref() int*>(tracker.SliceDataRows())[i];
698 currentSlice = iSlice;
// Idle thread picks up a tracklet from the new batch; the compaction offset
// (fTrackletStorePos) accounts for threads that resumed parked tracklets.
702 if (tmpTracklet >= 0 && rMem.fItr < 0)
704 rMem.fItr = tmpTracklet + (signed) get_local_id(0) - sMem.fTrackletStorePos;
705 if (rMem.fItr >= sMem.fNTracklets)
// Initialize the fresh tracklet from its start hit; the forward pass runs from
// the start row to the last row.
711 AliHLTTPCCAHitId id = tracker.TrackletStartHits()[rMem.fItr];
713 rMem.fStartRow = rMem.fEndRow = rMem.fFirstRow = rMem.fLastRow = id.RowIndex();
714 rMem.fCurrIH = id.HitIndex();
719 AliHLTTPCCATrackletConstructor::InitTracklet(tParam);
723 rMem.fIRow = rMem.fStartRow;
724 rMem.fIRowEnd = tracker.Param().NRows();
// Advance the tracklet by at most HLTCA_GPU_ALTSCHED_STEPSIZE rows per fetch
// cycle; in stage 2 (backward fit) the row index decrements.
730 for (int j = 0;j < HLTCA_GPU_ALTSCHED_STEPSIZE && rMem.fIRow != rMem.fIRowEnd;j++,rMem.fIRow += rMem.fStage == 2 ? -1 : 1)
732 #ifdef HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
733 if (rMem.fStage == 2)
735 if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && !(rMem.fIRow >= rMem.fEndRow || ( rMem.fIRow >= rMem.fStartRow && rMem.fIRow - rMem.fStartRow % 2 == 0)))
736 pTracker[0].StageAtSync()[threadSync++ * get_global_size(0) + get_global_id(0)] = rMem.fStage + 1;
740 if (rMem.fNMissed <= kMaxRowGap && rMem.fGo && rMem.fIRow >= rMem.fStartRow && (rMem.fStage > 0 || rMem.fCurrIH >= 0 || (rMem.fIRow - rMem.fStartRow) % 2 == 0 ))
741 pTracker[0].StageAtSync()[threadSync++ * get_global_size(0) + get_global_id(0)] = rMem.fStage + 1;
743 #endif //HLTCA_GPU_TRACKLET_CONSTRUCTOR_DO_PROFILE
744 UpdateTracklet(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam, rMem.fIRow);
// Current pass finished (row range exhausted or too many consecutive misses):
// either the tracklet is done, or it flips into the backward fit by
// transporting back to the end row.
747 if (rMem.fIRow == rMem.fIRowEnd || rMem.fNMissed > kMaxRowGap)
749 if (rMem.fStage >= 2)
757 if (!tParam.TransportToX( tracker.Row( rMem.fEndRow ).X(), tracker.Param().ConstBz(), .999)) rMem.fGo = 0;
758 rMem.fIRow = rMem.fEndRow;
// Finished tracklet: store it and report this thread as idle so FetchTracklet
// can refill it on the next cycle.
765 StoreTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, tracker, tParam );
767 CAMath::AtomicAddShared(&sMem.fNextTrackletCount, 1);
777 GPUg() void AliHLTTPCCATrackletConstructorGPU()
779 //GPU Wrapper for AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU
// Declares the work-group shared memory here and forwards to the member
// implementation selected by the scheduler preprocessor switches above.
780 AliHLTTPCCATracker *pTracker = ( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker );
781 GPUshared() AliHLTTPCCATrackletConstructor::MEM_LOCAL(AliHLTTPCCASharedMemory) sMem;
782 AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPU(pTracker, sMem);
785 GPUg() void AliHLTTPCCATrackletConstructorGPUPP(int firstSlice, int sliceCount)
// Wrapper for the per-slice ("PP") variant: one work-group per slice in the
// [firstSlice, firstSlice + sliceCount) range; excess groups return.
787 if (get_group_id(0) >= sliceCount) return;
788 AliHLTTPCCATracker *pTracker = &( ( AliHLTTPCCATracker* ) gAliHLTTPCCATracker )[firstSlice + get_group_id(0)];
789 AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPUPP(pTracker);
792 GPUd() void AliHLTTPCCATrackletConstructor::AliHLTTPCCATrackletConstructorGPUPP(AliHLTTPCCATracker *tracker)
// Unscheduled variant: the whole work-group processes one slice, iterating over
// its tracklets in stripes of the local size, with no dynamic scheduling.
794 GPUshared() AliHLTTPCCASharedMemory sMem;
// Reuse the shared-memory row-bound arrays when they exist in sMem; otherwise
// declare standalone shared arrays (one entry per warp, plus one spare).
795 #if defined(HLTCA_GPU_RESCHED) & !defined(HLTCA_GPU_ALTERNATIVE_SCHEDULER)
796 #define startRows sMem.fStartRows
797 #define endRows sMem.fEndRows
799 GPUshared() int startRows[HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR / HLTCA_GPU_WARP_SIZE + 1];
800 GPUshared() int endRows[HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR / HLTCA_GPU_WARP_SIZE + 1];
802 sMem.fNTracklets = *tracker->NTracklets();
// Stride-copy the row geometry into shared memory across all threads.
804 for (int i = get_local_id(0);i < HLTCA_ROW_COUNT * sizeof(AliHLTTPCCARow) / sizeof(int);i += get_local_size(0))
806 reinterpret_cast<int*>(&sMem.fRows)[i] = reinterpret_cast<int*>(tracker->SliceDataRows())[i];
// Loop over all tracklets; the trip count is rounded up to a multiple of the
// thread count so every thread participates in the warp-wide reductions below.
809 for (int iTracklet = get_local_id(0);iTracklet < (*tracker->NTracklets() / HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR + 1) * HLTCA_GPU_THREAD_COUNT_CONSTRUCTOR;iTracklet += get_local_size(0))
811 AliHLTTPCCATrackParam tParam;
812 AliHLTTPCCAThreadMemory rMem;
// Valid tracklet: initialize state from the temporary start hit list.
814 if (iTracklet < *tracker->NTracklets())
816 AliHLTTPCCAHitId id = tracker->TrackletTmpStartHits()[iTracklet];
818 rMem.fStartRow = rMem.fEndRow = rMem.fFirstRow = rMem.fLastRow = id.RowIndex();
819 rMem.fCurrIH = id.HitIndex();
824 AliHLTTPCCATrackletConstructor::InitTracklet(tParam);
826 rMem.fItr = iTracklet;
// Per-warp row bounds: start at 160 (above any valid row) / 0 and reduce with
// atomic min/max so the warp can iterate a common row range.
830 if (get_local_id(0) % HLTCA_GPU_WARP_SIZE == 0)
832 startRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE] = 160;
833 endRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE] = 0;
836 if (iTracklet < *tracker->NTracklets())
838 CAMath::AtomicMinShared(&startRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE], rMem.fStartRow);
// Forward pass from the warp's minimum start row to the last row.
841 if (iTracklet < *tracker->NTracklets())
843 for (int j = startRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE];j < HLTCA_ROW_COUNT;j++)
845 UpdateTracklet(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, *tracker, tParam, j);
846 if (!rMem.fGo) break;
// Transport back to the end row before the backward fit; failure stops the tracklet.
853 if ( !tParam.TransportToX( tracker->Row( rMem.fEndRow ).X(), tracker->Param().ConstBz(), .999 ) ) rMem.fGo = 0;
855 CAMath::AtomicMaxShared(&endRows[get_local_id(0) / HLTCA_GPU_WARP_SIZE], rMem.fEndRow);
// Backward pass down to row 0, then store the finished tracklet.
859 if (iTracklet < *tracker->NTracklets())
861 for (int j = rMem.fEndRow;j >= 0;j--)
863 if (!rMem.fGo) break;
864 UpdateTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, *tracker, tParam, j);
866 StoreTracklet( get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), sMem, rMem, *tracker, tParam );