// Vc/src/avx_sorthelper.cpp

/* This file is part of the Vc library.

    Copyright (C) 2011 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc. If not, see <http://www.gnu.org/licenses/>.

*/

#include <Vc/avx/intrinsics.h>
#include <Vc/avx/casts.h>
#include <Vc/avx/sorthelper.h>
#include <Vc/avx/macros.h>

namespace AliRoot {
namespace Vc
{
namespace AVX
{

template<> m128i SortHelper<short>::sort(VTArg _x)
{
    m128i lo, hi, y, x = _x;
    // sort pairs
    y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
    lo = _mm_min_epi16(x, y);
    hi = _mm_max_epi16(x, y);
    x = _mm_blend_epi16(lo, hi, 0xaa);

    // merge left and right quads
    y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)), _MM_SHUFFLE(0, 1, 2, 3));
    lo = _mm_min_epi16(x, y);
    hi = _mm_max_epi16(x, y);
    x = _mm_blend_epi16(lo, hi, 0xcc);
    y = _mm_srli_si128(x, 2);
    lo = _mm_min_epi16(x, y);
    hi = _mm_max_epi16(x, y);
    x = _mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa);

    // merge quads into octs
    y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2));
    y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3));
    lo = _mm_min_epi16(x, y);
    hi = _mm_max_epi16(x, y);

    x = _mm_unpacklo_epi16(lo, hi);
    y = _mm_srli_si128(x, 8);
    lo = _mm_min_epi16(x, y);
    hi = _mm_max_epi16(x, y);

    x = _mm_unpacklo_epi16(lo, hi);
    y = _mm_srli_si128(x, 8);
    lo = _mm_min_epi16(x, y);
    hi = _mm_max_epi16(x, y);

    return _mm_unpacklo_epi16(lo, hi);
}
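
// Usage sketch (illustrative only, not part of the library; it assumes VTArg of the
// 16-bit specializations accepts a plain __m128i holding eight 16-bit values):
//
//     short data[8] = {3, 1, 4, 1, 5, 9, 2, 6};
//     __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data));
//     __m128i s = SortHelper<short>::sort(v);                  // ascending order
//     _mm_storeu_si128(reinterpret_cast<__m128i *>(data), s);
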
template<> m128i SortHelper<unsigned short>::sort(VTArg _x)
{
    m128i lo, hi, y, x = _x;
    // sort pairs
    y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
    lo = _mm_min_epu16(x, y);
    hi = _mm_max_epu16(x, y);
    x = _mm_blend_epi16(lo, hi, 0xaa);

    // merge left and right quads
    y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)), _MM_SHUFFLE(0, 1, 2, 3));
    lo = _mm_min_epu16(x, y);
    hi = _mm_max_epu16(x, y);
    x = _mm_blend_epi16(lo, hi, 0xcc);
    y = _mm_srli_si128(x, 2);
    lo = _mm_min_epu16(x, y);
    hi = _mm_max_epu16(x, y);
    x = _mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa);

    // merge quads into octs
    y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2));
    y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3));
    lo = _mm_min_epu16(x, y);
    hi = _mm_max_epu16(x, y);

    x = _mm_unpacklo_epi16(lo, hi);
    y = _mm_srli_si128(x, 8);
    lo = _mm_min_epu16(x, y);
    hi = _mm_max_epu16(x, y);

    x = _mm_unpacklo_epi16(lo, hi);
    y = _mm_srli_si128(x, 8);
    lo = _mm_min_epu16(x, y);
    hi = _mm_max_epu16(x, y);

    return _mm_unpacklo_epi16(lo, hi);
}
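
// Each lo/hi pair above is one vectorized compare-exchange step of a sorting network:
// adjacent pairs are sorted first, pairs are then merged into sorted quads, and the two
// quads are merged into one sorted run of eight values. A scalar sketch of the primitive
// (hypothetical helper, shown only to clarify the idea):
//
//     inline void compareExchange(unsigned short &a, unsigned short &b) {
//         if (a > b) std::swap(a, b);   // the minimum ends up in a, the maximum in b
//     }
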

template<> m256i SortHelper<int>::sort(VTArg _hgfedcba)
{
    VectorType hgfedcba = _hgfedcba;
    const m128i hgfe = hi128(hgfedcba);
    const m128i dcba = lo128(hgfedcba);
    m128i l = _mm_min_epi32(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea
    m128i h = _mm_max_epi32(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea

    m128i x = _mm_unpacklo_epi32(l, h); // ↑fb ↓fb ↑ea ↓ea
    m128i y = _mm_unpackhi_epi32(l, h); // ↑hd ↓hd ↑gc ↓gc

    l = _mm_min_epi32(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca
    h = _mm_max_epi32(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc)

    x = _mm_min_epi32(l, Reg::permute<X2, X2, X0, X0>(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca)
    y = _mm_max_epi32(h, Reg::permute<X3, X3, X1, X1>(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca)

    m128i b = Reg::shuffle<Y0, Y1, X0, X1>(y, x); // b3 <= b2 <= b1 <= b0
    m128i a = _mm_unpackhi_epi64(x, y); // a3 >= a2 >= a1 >= a0

    // _mm_extract_epi32 from clang < 3.4 returns an unsigned int - the static_cast is free for
    // conforming compilers, but fixes broken ones
    if (VC_IS_UNLIKELY(static_cast<int>(_mm_extract_epi32(x, 2)) >= static_cast<int>(_mm_extract_epi32(y, 1)))) {
        return concat(Reg::permute<X0, X1, X2, X3>(b), a);
    } else if (VC_IS_UNLIKELY(static_cast<int>(_mm_extract_epi32(x, 0)) >= static_cast<int>(_mm_extract_epi32(y, 3)))) {
        return concat(a, Reg::permute<X0, X1, X2, X3>(b));
    }

    // merge
    l = _mm_min_epi32(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
    h = _mm_max_epi32(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0

    a = _mm_unpacklo_epi32(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0
    b = _mm_unpackhi_epi32(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2
    l = _mm_min_epi32(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2
    h = _mm_max_epi32(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2)

    a = _mm_unpacklo_epi32(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2
    b = _mm_unpackhi_epi32(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3
    l = _mm_min_epi32(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3
    h = _mm_max_epi32(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3)

    return concat(_mm_unpacklo_epi32(l, h), _mm_unpackhi_epi32(l, h));
}
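
// The two early returns above are a shortcut for the case where the two sorted quads do
// not overlap: a holds one quad in ascending order, b the other in descending order.
// A scalar sketch of the test (illustrative only):
//
//     // if min(a) >= max(b): all of b is below a  ->  result = [b reversed, a]
//     // if min(b) >= max(a): all of a is below b  ->  result = [a, b reversed]
//     // otherwise the merge network in the second half of the function is required
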

template<> m256i SortHelper<unsigned int>::sort(VTArg _hgfedcba)
{
    VectorType hgfedcba = _hgfedcba;
    const m128i hgfe = hi128(hgfedcba);
    const m128i dcba = lo128(hgfedcba);
    m128i l = _mm_min_epu32(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea
    m128i h = _mm_max_epu32(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea

    m128i x = _mm_unpacklo_epi32(l, h); // ↑fb ↓fb ↑ea ↓ea
    m128i y = _mm_unpackhi_epi32(l, h); // ↑hd ↓hd ↑gc ↓gc

    l = _mm_min_epu32(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca
    h = _mm_max_epu32(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc)

    x = _mm_min_epu32(l, Reg::permute<X2, X2, X0, X0>(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca)
    y = _mm_max_epu32(h, Reg::permute<X3, X3, X1, X1>(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca)

    m128i b = Reg::shuffle<Y0, Y1, X0, X1>(y, x); // b3 <= b2 <= b1 <= b0
    m128i a = _mm_unpackhi_epi64(x, y); // a3 >= a2 >= a1 >= a0

    if (VC_IS_UNLIKELY(_mm_extract_epu32(x, 2) >= _mm_extract_epu32(y, 1))) {
        return concat(Reg::permute<X0, X1, X2, X3>(b), a);
    } else if (VC_IS_UNLIKELY(_mm_extract_epu32(x, 0) >= _mm_extract_epu32(y, 3))) {
        return concat(a, Reg::permute<X0, X1, X2, X3>(b));
    }

    // merge
    l = _mm_min_epu32(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
    h = _mm_max_epu32(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0

    a = _mm_unpacklo_epi32(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0
    b = _mm_unpackhi_epi32(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2
    l = _mm_min_epu32(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2
    h = _mm_max_epu32(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2)

    a = _mm_unpacklo_epi32(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2
    b = _mm_unpackhi_epi32(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3
    l = _mm_min_epu32(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3
    h = _mm_max_epu32(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3)

    return concat(_mm_unpacklo_epi32(l, h), _mm_unpackhi_epi32(l, h));
}
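
// Usage sketch for the 32-bit specializations (illustrative only; it assumes VTArg
// accepts a plain __m256i holding eight 32-bit values):
//
//     unsigned int data[8] = {7u, 3u, 0xffffffffu, 1u, 0u, 9u, 4u, 2u};
//     __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(data));
//     __m256i s = SortHelper<unsigned int>::sort(v);              // unsigned ascending order
//     _mm256_storeu_si256(reinterpret_cast<__m256i *>(data), s);
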

template<> m256 SortHelper<float>::sort(VTArg _hgfedcba)
{
    VectorType hgfedcba = _hgfedcba;
    const m128 hgfe = hi128(hgfedcba);
    const m128 dcba = lo128(hgfedcba);
    m128 l = _mm_min_ps(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea
    m128 h = _mm_max_ps(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea

    m128 x = _mm_unpacklo_ps(l, h); // ↑fb ↓fb ↑ea ↓ea
    m128 y = _mm_unpackhi_ps(l, h); // ↑hd ↓hd ↑gc ↓gc

    l = _mm_min_ps(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca
    h = _mm_max_ps(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc)

    x = _mm_min_ps(l, Reg::permute<X2, X2, X0, X0>(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca)
    y = _mm_max_ps(h, Reg::permute<X3, X3, X1, X1>(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca)

    m128 a = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(x), _mm_castps_pd(y))); // a3 >= a2 >= a1 >= a0
    m128 b = Reg::shuffle<Y0, Y1, X0, X1>(y, x); // b3 <= b2 <= b1 <= b0

    // merge
    l = _mm_min_ps(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
    h = _mm_max_ps(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0

    a = _mm_unpacklo_ps(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0
    b = _mm_unpackhi_ps(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2
    l = _mm_min_ps(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2
    h = _mm_max_ps(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2)

    a = _mm_unpacklo_ps(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2
    b = _mm_unpackhi_ps(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3
    l = _mm_min_ps(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3
    h = _mm_max_ps(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3)

    return concat(_mm_unpacklo_ps(l, h), _mm_unpackhi_ps(l, h));
}
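
// The "merge" block above combines the ascending quad a and the descending quad b; taken
// together they form a bitonic sequence, so a few compare-exchange stages suffice. A scalar
// sketch of the first stage (illustrative only; the shuffles between stages are omitted):
//
//     for (int i = 0; i < 4; ++i) {
//         const float lo = std::min(a[i], b[i]);   // the four smallest values
//         const float hi = std::max(a[i], b[i]);   // the four largest values
//         a[i] = lo; b[i] = hi;
//     }
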
template<> m256 SortHelper<sfloat>::sort(VTArg hgfedcba)
{
    return SortHelper<float>::sort(hgfedcba);
}

template<> void SortHelper<double>::sort(m256d &VC_RESTRICT x, m256d &VC_RESTRICT y)
{
    m256d l = _mm256_min_pd(x, y); // ↓x3y3 ↓x2y2 ↓x1y1 ↓x0y0
    m256d h = _mm256_max_pd(x, y); // ↑x3y3 ↑x2y2 ↑x1y1 ↑x0y0
    x = _mm256_unpacklo_pd(l, h); // ↑x2y2 ↓x2y2 ↑x0y0 ↓x0y0
    y = _mm256_unpackhi_pd(l, h); // ↑x3y3 ↓x3y3 ↑x1y1 ↓x1y1
    l = _mm256_min_pd(x, y); // ↓(↑x2y2,↑x3y3) ↓x3x2y3y2 ↓(↑x0y0,↑x1y1) ↓x1x0y1y0
    h = _mm256_max_pd(x, y); // ↑x3x2y3y2 ↑(↓x2y2,↓x3y3) ↑x1x0y1y0 ↑(↓x0y0,↓x1y1)
    x = _mm256_unpacklo_pd(l, h); // ↑(↓x2y2,↓x3y3) ↓x3x2y3y2 ↑(↓x0y0,↓x1y1) ↓x1x0y1y0
    y = _mm256_unpackhi_pd(h, l); // ↓(↑x2y2,↑x3y3) ↑x3x2y3y2 ↓(↑x0y0,↑x1y1) ↑x1x0y1y0
    l = _mm256_min_pd(x, y); // ↓(↑(↓x2y2,↓x3y3) ↓(↑x2y2,↑x3y3)) ↓x3x2y3y2 ↓(↑(↓x0y0,↓x1y1) ↓(↑x0y0,↑x1y1)) ↓x1x0y1y0
    h = _mm256_max_pd(x, y); // ↑(↑(↓x2y2,↓x3y3) ↓(↑x2y2,↑x3y3)) ↑x3x2y3y2 ↑(↑(↓x0y0,↓x1y1) ↓(↑x0y0,↑x1y1)) ↑x1x0y1y0
    m256d a = Reg::permute<X2, X3, X1, X0>(Reg::permute128<X0, X1>(h, h)); // h0 h1 h3 h2
    m256d b = Reg::permute<X2, X3, X1, X0>(l); // l2 l3 l1 l0

    // a3 >= a2 >= b1 >= b0
    // b3 <= b2 <= a1 <= a0

    // merge
    l = _mm256_min_pd(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
    h = _mm256_max_pd(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0

    x = _mm256_unpacklo_pd(l, h); // ↑a2b2 ↓a2b2 ↑a0b0 ↓a0b0
    y = _mm256_unpackhi_pd(l, h); // ↑a3b3 ↓a3b3 ↑a1b1 ↓a1b1
    l = _mm256_min_pd(x, y); // ↓(↑a2b2,↑a3b3) ↓a2b3 ↓(↑a0b0,↑a1b1) ↓a1b0
    h = _mm256_max_pd(x, y); // ↑a3b2 ↑(↓a2b2,↓a3b3) ↑a0b1 ↑(↓a0b0,↓a1b1)

    x = Reg::permute128<Y0, X0>(l, h); // ↑a0b1 ↑(↓a0b0,↓a1b1) ↓(↑a0b0,↑a1b1) ↓a1b0
    y = Reg::permute128<Y1, X1>(l, h); // ↑a3b2 ↑(↓a2b2,↓a3b3) ↓(↑a2b2,↑a3b3) ↓a2b3
    l = _mm256_min_pd(x, y); // ↓(↑a0b1,↑a3b2) ↓(↑(↓a0b0,↓a1b1) ↑(↓a2b2,↓a3b3)) ↓(↑a0b0,↑a1b1,↑a2b2,↑a3b3) ↓b0b3
    h = _mm256_max_pd(x, y); // ↑a0a3 ↑(↓a0b0,↓a1b1,↓a2b2,↓a3b3) ↑(↓(↑a0b0,↑a1b1) ↓(↑a2b2,↑a3b3)) ↑(↓a1b0,↓a2b3)

    x = _mm256_unpacklo_pd(l, h); // h2 l2 h0 l0
    y = _mm256_unpackhi_pd(l, h); // h3 l3 h1 l1
}
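
// Note on the pd shuffles used above: the AVX unpacks operate within each 128-bit lane,
// i.e. _mm256_unpacklo_pd(a, b) yields {a0, b0, a2, b2} and _mm256_unpackhi_pd(a, b)
// yields {a1, b1, a3, b3} (element 0 is the lowest). Illustrative check (not part of the
// library):
//
//     double av[4] = {0, 1, 2, 3}, bv[4] = {10, 11, 12, 13};
//     __m256d a = _mm256_loadu_pd(av), b = _mm256_loadu_pd(bv);
//     __m256d lo = _mm256_unpacklo_pd(a, b);   // {0, 10, 2, 12}
//     __m256d hi = _mm256_unpackhi_pd(a, b);   // {1, 11, 3, 13}
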
template<> m256d SortHelper<double>::sort(VTArg _dcba)
{
    VectorType dcba = _dcba;
    /*
     * to find the second largest number find
     * max(min(max(ab),max(cd)), min(max(ad),max(bc)))
     * or
     * max(max(min(ab),min(cd)), min(max(ab),max(cd)))
     *
    const m256d adcb = avx_cast<m256d>(concat(_mm_alignr_epi8(avx_cast<m128i>(dc), avx_cast<m128i>(ba), 8), _mm_alignr_epi8(avx_cast<m128i>(ba), avx_cast<m128i>(dc), 8)));
    const m256d l = _mm256_min_pd(dcba, adcb); // min(ad cd bc ab)
    const m256d h = _mm256_max_pd(dcba, adcb); // max(ad cd bc ab)
    // max(h3, h1)
    // max(min(h0,h2), min(h3,h1))
    // min(max(l0,l2), max(l3,l1))
    // min(l3, l1)

    const m256d ll = _mm256_min_pd(h, Reg::permute128<X0, X1>(h, h)); // min(h3h1 h2h0 h1h3 h0h2)
    //const m256d hh = _mm256_max_pd(h3 ll1_3 l1 l0, h1 ll0_2 l3 l2);
    const m256d hh = _mm256_max_pd(
            Reg::permute128<X1, Y0>(_mm256_unpackhi_pd(ll, h), l),
            Reg::permute128<X0, Y1>(_mm256_blend_pd(h ll, 0x1), l));
    _mm256_min_pd(hh0, hh1
    */

    //////////////////////////////////////////////////////////////////////////////////
    // max(max(ac), max(bd))
    // max(max(min(ac),min(bd)), min(max(ac),max(bd)))
    // min(max(min(ac),min(bd)), min(max(ac),max(bd)))
    // min(min(ac), min(bd))
    m128d l = _mm_min_pd(lo128(dcba), hi128(dcba)); // min(bd) min(ac)
    m128d h = _mm_max_pd(lo128(dcba), hi128(dcba)); // max(bd) max(ac)
    m128d h0_l0 = _mm_unpacklo_pd(l, h);
    m128d h1_l1 = _mm_unpackhi_pd(l, h);
    l = _mm_min_pd(h0_l0, h1_l1);
    h = _mm_max_pd(h0_l0, h1_l1);
    return concat(
        _mm_min_pd(l, Reg::permute<X0, X0>(h)),
        _mm_max_pd(h, Reg::permute<X1, X1>(l))
    );
    // extract: 1 cycle
    // min/max: 4 cycles
    // unpacklo/hi: 2 cycles
    // min/max: 4 cycles
    // permute: 1 cycle
    // min/max: 4 cycles
    // insert: 1 cycle
    // ----------------------
    // total: 17 cycles

    /*
    m256d cdab = Reg::permute<X2, X3, X0, X1>(dcba);
    m256d l = _mm256_min_pd(dcba, cdab);
    m256d h = _mm256_max_pd(dcba, cdab);
    m256d maxmin_ba = Reg::permute128<X0, Y0>(l, h);
    m256d maxmin_dc = Reg::permute128<X1, Y1>(l, h);

    l = _mm256_min_pd(maxmin_ba, maxmin_dc);
    h = _mm256_max_pd(maxmin_ba, maxmin_dc);

    return _mm256_blend_pd(h, l, 0x55);
    */

    /*
    // a b c d
    // b a d c
    // sort pairs
    m256d y, l, h;
    m128d l2, h2;
    y = shuffle<X1, Y0, X3, Y2>(x, x);
    l = _mm256_min_pd(x, y); // min[ab ab cd cd]
    h = _mm256_max_pd(x, y); // max[ab ab cd cd]

    // 1 of 2 is at [0]
    // 1 of 4 is at [1]
    // 1 of 4 is at [2]
    // 1 of 2 is at [3]

    // don't be fooled by unpack here. It works differently for AVX pd than for SSE ps
    x = _mm256_unpacklo_pd(l, h); // l_ab h_ab l_cd h_cd
    l2 = _mm_min_pd(lo128(x), hi128(x)); // l_abcd l(h_ab hcd)
    h2 = _mm_max_pd(lo128(x), hi128(x)); // h(l_ab l_cd) h_abcd

    // either it is:
    return concat(l2, h2);
    // or:
    // concat(_mm_unpacklo_pd(l2, h2), _mm_unpackhi_pd(l2, h2));

    // I'd like to have four useful compares
    const m128d dc = hi128(dcba);
    const m128d ba = lo128(dcba);
    const m256d adcb = avx_cast<m256d>(concat(_mm_alignr_epi8(avx_cast<m128i>(dc), avx_cast<m128i>(ba), 8), _mm_alignr_epi8(avx_cast<m128i>(ba), avx_cast<m128i>(dc), 8)));

    const int extraCmp = _mm_movemask_pd(_mm_cmpgt_pd(dc, ba));
    // 0x0: d <= b && c <= a
    // 0x1: d <= b && c > a
    // 0x2: d > b && c <= a
    // 0x3: d > b && c > a

    switch (_mm256_movemask_pd(_mm256_cmpgt_pd(dcba, adcb))) {
    // impossible: 0x0, 0xf
    case 0x1: // a <= b && b <= c && c <= d && d > a
        // abcd
        return Reg::permute<X2, X3, X0, X1>(Reg::permute<X0, X1>(dcba, dcba));
    case 0x2: // a <= b && b <= c && c > d && d <= a
        // dabc
        return Reg::permute<X2, X3, X0, X1>(adcb);
    case 0x3: // a <= b && b <= c && c > d && d > a
        // a[bd]c
        if (extraCmp & 2) {
            // abdc
            return Reg::permute<X2, X3, X1, X0>(Reg::permute<X0, X1>(dcba, dcba));
        } else {
            // adbc
            return Reg::permute<X3, X2, X0, X1>(adcb);
        }
    case 0x4: // a <= b && b > c && c <= d && d <= a
        // cdab;
        return Reg::permute<X2, X3, X0, X1>(dcba);
    case 0x5: // a <= b && b > c && c <= d && d > a
        // [ac] < [bd]
        switch (extraCmp) {
        case 0x0: // d <= b && c <= a
            // cadb
            return shuffle<>(dcba, bcda);
        case 0x1: // d <= b && c > a
        case 0x2: // d > b && c <= a
        case 0x3: // d > b && c > a
        }
    case 0x6: // a <= b && b > c && c > d && d <= a
        // d[ac]b
    case 0x7: // a <= b && b > c && c > d && d > a
        // adcb;
        return permute<X1, X0, X3, X2>(permute128<X1, X0>(bcda, bcda));
    case 0x8: // a > b && b <= c && c <= d && d <= a
        return bcda;
    case 0x9: // a > b && b <= c && c <= d && d > a
        // b[ac]d;
    case 0xa: // a > b && b <= c && c > d && d <= a
        // [ac] > [bd]
    case 0xb: // a > b && b <= c && c > d && d > a
        // badc;
        return permute128<X1, X0>(dcba);
    case 0xc: // a > b && b > c && c <= d && d <= a
        // c[bd]a;
    case 0xd: // a > b && b > c && c <= d && d > a
        // cbad;
        return permute<X1, X0, X3, X2>(bcda);
    case 0xe: // a > b && b > c && c > d && d <= a
        return dcba;
    }
    */
}
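
// A scalar rendering of the formula the active 4-element network above implements
// (illustrative only, following the min/max comments inside the function):
//
//     const double lo_ac = std::min(a, c), lo_bd = std::min(b, d);
//     const double hi_ac = std::max(a, c), hi_bd = std::max(b, d);
//     const double r0 = std::min(lo_ac, lo_bd);                                   // smallest
//     const double r1 = std::min(std::max(lo_ac, lo_bd), std::min(hi_ac, hi_bd)); // 2nd smallest
//     const double r2 = std::max(std::max(lo_ac, lo_bd), std::min(hi_ac, hi_bd)); // 2nd largest
//     const double r3 = std::max(hi_ac, hi_bd);                                   // largest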

} // namespace AVX
} // namespace Vc
} // namespace AliRoot