/*  This file is part of the Vc library.

    Copyright (C) 2011 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc. If not, see <http://www.gnu.org/licenses/>.

*/

#include <Vc/avx/intrinsics.h>
#include <Vc/avx/casts.h>
#include <Vc/avx/sorthelper.h>
#include <Vc/avx/macros.h>

namespace Vc
{
namespace AVX
{

template<> __m128i SortHelper<short>::sort(__m128i x)
{
    __m128i lo, hi, y;
    // sort pairs
    y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
    lo = _mm_min_epi16(x, y);
    hi = _mm_max_epi16(x, y);
    x = _mm_blend_epi16(lo, hi, 0xaa);

    // merge left and right quads
    y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)), _MM_SHUFFLE(0, 1, 2, 3));
    lo = _mm_min_epi16(x, y);
    hi = _mm_max_epi16(x, y);
    x = _mm_blend_epi16(lo, hi, 0xcc);
    y = _mm_srli_si128(x, 2);
    lo = _mm_min_epi16(x, y);
    hi = _mm_max_epi16(x, y);
    x = _mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa);

    // merge quads into octs
    y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2));
    y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3));
    lo = _mm_min_epi16(x, y);
    hi = _mm_max_epi16(x, y);

    x = _mm_unpacklo_epi16(lo, hi);
    y = _mm_srli_si128(x, 8);
    lo = _mm_min_epi16(x, y);
    hi = _mm_max_epi16(x, y);

    x = _mm_unpacklo_epi16(lo, hi);
    y = _mm_srli_si128(x, 8);
    lo = _mm_min_epi16(x, y);
    hi = _mm_max_epi16(x, y);

    return _mm_unpacklo_epi16(lo, hi);
}
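
// Illustrative usage sketch, not part of the library: sorting eight 16-bit
// values with the helper above. The buffer and surrounding code are
// hypothetical; only plain SSE2 load/store intrinsics are assumed.
//
//     short data[8] = { 7, 3, 5, 1, 8, 2, 6, 4 };
//     __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data));
//     v = Vc::AVX::SortHelper<short>::sort(v); // sorted ascending
//     _mm_storeu_si128(reinterpret_cast<__m128i *>(data), v);
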
template<> __m128i SortHelper<unsigned short>::sort(__m128i x)
{
    __m128i lo, hi, y;
    // sort pairs
    y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
    lo = _mm_min_epu16(x, y);
    hi = _mm_max_epu16(x, y);
    x = _mm_blend_epi16(lo, hi, 0xaa);

    // merge left and right quads
    y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)), _MM_SHUFFLE(0, 1, 2, 3));
    lo = _mm_min_epu16(x, y);
    hi = _mm_max_epu16(x, y);
    x = _mm_blend_epi16(lo, hi, 0xcc);
    y = _mm_srli_si128(x, 2);
    lo = _mm_min_epu16(x, y);
    hi = _mm_max_epu16(x, y);
    x = _mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa);

    // merge quads into octs
    y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2));
    y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3));
    lo = _mm_min_epu16(x, y);
    hi = _mm_max_epu16(x, y);

    x = _mm_unpacklo_epi16(lo, hi);
    y = _mm_srli_si128(x, 8);
    lo = _mm_min_epu16(x, y);
    hi = _mm_max_epu16(x, y);

    x = _mm_unpacklo_epi16(lo, hi);
    y = _mm_srli_si128(x, 8);
    lo = _mm_min_epu16(x, y);
    hi = _mm_max_epu16(x, y);

    return _mm_unpacklo_epi16(lo, hi);
}

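// Notation used in the comments below: ↓xy = min(x, y), ↑xy = max(x, y).
// The letters a..h name the input lanes from lowest (a, element 0) to highest
// (h, element 7); lane lists in the comments are written with the highest
// element on the left.
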
template<> __m256i SortHelper<int>::sort(__m256i hgfedcba)
{
    const __m128i hgfe = hi128(hgfedcba);
    const __m128i dcba = lo128(hgfedcba);
    __m128i l = _mm_min_epi32(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea
    __m128i h = _mm_max_epi32(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea

    __m128i x = _mm_unpacklo_epi32(l, h); // ↑fb ↓fb ↑ea ↓ea
    __m128i y = _mm_unpackhi_epi32(l, h); // ↑hd ↓hd ↑gc ↓gc

    l = _mm_min_epi32(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca
    h = _mm_max_epi32(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc)

    x = _mm_min_epi32(l, Reg::permute<X2, X2, X0, X0>(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca)
    y = _mm_max_epi32(h, Reg::permute<X3, X3, X1, X1>(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca)

    __m128i b = Reg::shuffle<Y0, Y1, X0, X1>(y, x); // b3 <= b2 <= b1 <= b0
    __m128i a = _mm_unpackhi_epi64(x, y); // a3 >= a2 >= a1 >= a0

    if (VC_IS_UNLIKELY(_mm_extract_epi32(x, 2) >= _mm_extract_epi32(y, 1))) {
        return concat(Reg::permute<X0, X1, X2, X3>(b), a);
    } else if (VC_IS_UNLIKELY(_mm_extract_epi32(x, 0) >= _mm_extract_epi32(y, 3))) {
        return concat(a, Reg::permute<X0, X1, X2, X3>(b));
    }

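    // Note: a and b now hold the two sorted quads in opposite order, so their
    // concatenation is a bitonic sequence; the min/max/unpack stages below merge
    // them into a fully sorted octet. The two unlikely branches above cover the
    // case where one quad already lies entirely above the other, in which case
    // the merge is unnecessary and the quads are returned concatenated directly.
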
    // merge
    l = _mm_min_epi32(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
    h = _mm_max_epi32(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0

    a = _mm_unpacklo_epi32(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0
    b = _mm_unpackhi_epi32(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2
    l = _mm_min_epi32(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2
    h = _mm_max_epi32(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2)

    a = _mm_unpacklo_epi32(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2
    b = _mm_unpackhi_epi32(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3
    l = _mm_min_epi32(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3
    h = _mm_max_epi32(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3)

    return concat(_mm_unpacklo_epi32(l, h), _mm_unpackhi_epi32(l, h));
}

template<> __m256i SortHelper<unsigned int>::sort(__m256i hgfedcba)
{
    const __m128i hgfe = hi128(hgfedcba);
    const __m128i dcba = lo128(hgfedcba);
    __m128i l = _mm_min_epu32(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea
    __m128i h = _mm_max_epu32(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea

    __m128i x = _mm_unpacklo_epi32(l, h); // ↑fb ↓fb ↑ea ↓ea
    __m128i y = _mm_unpackhi_epi32(l, h); // ↑hd ↓hd ↑gc ↓gc

    l = _mm_min_epu32(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca
    h = _mm_max_epu32(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc)

    x = _mm_min_epu32(l, Reg::permute<X2, X2, X0, X0>(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca)
    y = _mm_max_epu32(h, Reg::permute<X3, X3, X1, X1>(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca)

    __m128i b = Reg::shuffle<Y0, Y1, X0, X1>(y, x); // b3 <= b2 <= b1 <= b0
    __m128i a = _mm_unpackhi_epi64(x, y); // a3 >= a2 >= a1 >= a0

    if (VC_IS_UNLIKELY(_mm_extract_epu32(x, 2) >= _mm_extract_epu32(y, 1))) {
        return concat(Reg::permute<X0, X1, X2, X3>(b), a);
    } else if (VC_IS_UNLIKELY(_mm_extract_epu32(x, 0) >= _mm_extract_epu32(y, 3))) {
        return concat(a, Reg::permute<X0, X1, X2, X3>(b));
    }

    // merge
    l = _mm_min_epu32(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
    h = _mm_max_epu32(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0

    a = _mm_unpacklo_epi32(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0
    b = _mm_unpackhi_epi32(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2
    l = _mm_min_epu32(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2
    h = _mm_max_epu32(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2)

    a = _mm_unpacklo_epi32(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2
    b = _mm_unpackhi_epi32(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3
    l = _mm_min_epu32(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3
    h = _mm_max_epu32(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3)

    return concat(_mm_unpacklo_epi32(l, h), _mm_unpackhi_epi32(l, h));
}

template<> __m256 SortHelper<float>::sort(__m256 hgfedcba)
{
    const __m128 hgfe = hi128(hgfedcba);
    const __m128 dcba = lo128(hgfedcba);
    __m128 l = _mm_min_ps(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea
    __m128 h = _mm_max_ps(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea

    __m128 x = _mm_unpacklo_ps(l, h); // ↑fb ↓fb ↑ea ↓ea
    __m128 y = _mm_unpackhi_ps(l, h); // ↑hd ↓hd ↑gc ↓gc

    l = _mm_min_ps(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca
    h = _mm_max_ps(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc)

    x = _mm_min_ps(l, Reg::permute<X2, X2, X0, X0>(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca)
    y = _mm_max_ps(h, Reg::permute<X3, X3, X1, X1>(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca)

    __m128 a = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(x), _mm_castps_pd(y))); // a3 >= a2 >= a1 >= a0
    __m128 b = Reg::shuffle<Y0, Y1, X0, X1>(y, x); // b3 <= b2 <= b1 <= b0

    // merge
    l = _mm_min_ps(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
    h = _mm_max_ps(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0

    a = _mm_unpacklo_ps(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0
    b = _mm_unpackhi_ps(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2
    l = _mm_min_ps(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2
    h = _mm_max_ps(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2)

    a = _mm_unpacklo_ps(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2
    b = _mm_unpackhi_ps(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3
    l = _mm_min_ps(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3
    h = _mm_max_ps(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3)

    return concat(_mm_unpacklo_ps(l, h), _mm_unpackhi_ps(l, h));
}
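
// Illustrative usage sketch, not part of the library: sorting eight floats with
// the AVX helper above. The array and surrounding code are hypothetical.
//
//     float data[8] = { 2.f, 7.f, 1.f, 8.f, 5.f, 3.f, 6.f, 4.f };
//     __m256 v = _mm256_loadu_ps(data);
//     v = Vc::AVX::SortHelper<float>::sort(v); // sorted ascending
//     _mm256_storeu_ps(data, v);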

template<> __m256 SortHelper<sfloat>::sort(__m256 hgfedcba)
{
    return SortHelper<float>::sort(hgfedcba);
}

template<> void SortHelper<double>::sort(__m256d &VC_RESTRICT x, __m256d &VC_RESTRICT y)
{
    __m256d l = _mm256_min_pd(x, y); // ↓x3y3 ↓x2y2 ↓x1y1 ↓x0y0
    __m256d h = _mm256_max_pd(x, y); // ↑x3y3 ↑x2y2 ↑x1y1 ↑x0y0
    x = _mm256_unpacklo_pd(l, h); // ↑x2y2 ↓x2y2 ↑x0y0 ↓x0y0
    y = _mm256_unpackhi_pd(l, h); // ↑x3y3 ↓x3y3 ↑x1y1 ↓x1y1
    l = _mm256_min_pd(x, y); // ↓(↑x2y2,↑x3y3) ↓x3x2y3y2 ↓(↑x0y0,↑x1y1) ↓x1x0y1y0
    h = _mm256_max_pd(x, y); // ↑x3x2y3y2 ↑(↓x2y2,↓x3y3) ↑x1x0y1y0 ↑(↓x0y0,↓x1y1)
    x = _mm256_unpacklo_pd(l, h); // ↑(↓x2y2,↓x3y3) ↓x3x2y3y2 ↑(↓x0y0,↓x1y1) ↓x1x0y1y0
    y = _mm256_unpackhi_pd(h, l); // ↓(↑x2y2,↑x3y3) ↑x3x2y3y2 ↓(↑x0y0,↑x1y1) ↑x1x0y1y0
    l = _mm256_min_pd(x, y); // ↓(↑(↓x2y2,↓x3y3) ↓(↑x2y2,↑x3y3)) ↓x3x2y3y2 ↓(↑(↓x0y0,↓x1y1) ↓(↑x0y0,↑x1y1)) ↓x1x0y1y0
    h = _mm256_max_pd(x, y); // ↑(↑(↓x2y2,↓x3y3) ↓(↑x2y2,↑x3y3)) ↑x3x2y3y2 ↑(↑(↓x0y0,↓x1y1) ↓(↑x0y0,↑x1y1)) ↑x1x0y1y0
    __m256d a = Reg::permute<X2, X3, X1, X0>(Reg::permute128<X0, X1>(h, h)); // h0 h1 h3 h2
    __m256d b = Reg::permute<X2, X3, X1, X0>(l); // l2 l3 l1 l0

    // a3 >= a2 >= b1 >= b0
    // b3 <= b2 <= a1 <= a0

    // merge
    l = _mm256_min_pd(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0
    h = _mm256_max_pd(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0

    x = _mm256_unpacklo_pd(l, h); // ↑a2b2 ↓a2b2 ↑a0b0 ↓a0b0
    y = _mm256_unpackhi_pd(l, h); // ↑a3b3 ↓a3b3 ↑a1b1 ↓a1b1
    l = _mm256_min_pd(x, y); // ↓(↑a2b2,↑a3b3) ↓a2b3 ↓(↑a0b0,↑a1b1) ↓a1b0
    h = _mm256_max_pd(x, y); // ↑a3b2 ↑(↓a2b2,↓a3b3) ↑a0b1 ↑(↓a0b0,↓a1b1)

    x = Reg::permute128<Y0, X0>(l, h); // ↑a0b1 ↑(↓a0b0,↓a1b1) ↓(↑a0b0,↑a1b1) ↓a1b0
    y = Reg::permute128<Y1, X1>(l, h); // ↑a3b2 ↑(↓a2b2,↓a3b3) ↓(↑a2b2,↑a3b3) ↓a2b3
    l = _mm256_min_pd(x, y); // ↓(↑a0b1,↑a3b2) ↓(↑(↓a0b0,↓a1b1) ↑(↓a2b2,↓a3b3)) ↓(↑a0b0,↑a1b1,↑a2b2,↑a3b3) ↓b0b3
    h = _mm256_max_pd(x, y); // ↑a0a3 ↑(↓a0b0,↓a1b1,↓a2b2,↓a3b3) ↑(↓(↑a0b0,↑a1b1) ↓(↑a2b2,↑a3b3)) ↑(↓a1b0,↓a2b3)

    x = _mm256_unpacklo_pd(l, h); // h2 l2 h0 l0
    y = _mm256_unpackhi_pd(l, h); // h3 l3 h1 l1
}
template<> __m256d SortHelper<double>::sort(__m256d dcba)
{
    /*
     * to find the second largest number find
     * max(min(max(ab),max(cd)), min(max(ad),max(bc)))
     * or
     * max(max(min(ab),min(cd)), min(max(ab),max(cd)))
     *
    const __m256d adcb = avx_cast<__m256d>(concat(_mm_alignr_epi8(avx_cast<__m128i>(dc), avx_cast<__m128i>(ba), 8), _mm_alignr_epi8(avx_cast<__m128i>(ba), avx_cast<__m128i>(dc), 8)));
    const __m256d l = _mm256_min_pd(dcba, adcb); // min(ad cd bc ab)
    const __m256d h = _mm256_max_pd(dcba, adcb); // max(ad cd bc ab)
    // max(h3, h1)
    // max(min(h0,h2), min(h3,h1))
    // min(max(l0,l2), max(l3,l1))
    // min(l3, l1)

    const __m256d ll = _mm256_min_pd(h, Reg::permute128<X0, X1>(h, h)); // min(h3h1 h2h0 h1h3 h0h2)
    //const __m256d hh = _mm256_max_pd(h3 ll1_3 l1 l0, h1 ll0_2 l3 l2);
    const __m256d hh = _mm256_max_pd(
            Reg::permute128<X1, Y0>(_mm256_unpackhi_pd(ll, h), l),
            Reg::permute128<X0, Y1>(_mm256_blend_pd(h ll, 0x1), l));
    _mm256_min_pd(hh0, hh1
    */

    //////////////////////////////////////////////////////////////////////////////////
    // max(max(ac), max(bd))
    // max(max(min(ac),min(bd)), min(max(ac),max(bd)))
    // min(max(min(ac),min(bd)), min(max(ac),max(bd)))
    // min(min(ac), min(bd))
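    // Worked example of the identities above, with arbitrary values: for
    // (a, b, c, d) = (3, 1, 4, 2) we get min(ac) = 3, min(bd) = 1, max(ac) = 4,
    // max(bd) = 2, so
    //   largest        = max(max(ac), max(bd))                           = max(4, 2) = 4
    //   second largest = max(max(min(ac),min(bd)), min(max(ac),max(bd))) = max(3, 2) = 3
    //   third largest  = min(max(min(ac),min(bd)), min(max(ac),max(bd))) = min(3, 2) = 2
    //   smallest       = min(min(ac), min(bd))                           = min(3, 1) = 1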
    __m128d l = _mm_min_pd(lo128(dcba), hi128(dcba)); // min(bd) min(ac)
    __m128d h = _mm_max_pd(lo128(dcba), hi128(dcba)); // max(bd) max(ac)
    __m128d h0_l0 = _mm_unpacklo_pd(l, h);
    __m128d h1_l1 = _mm_unpackhi_pd(l, h);
    l = _mm_min_pd(h0_l0, h1_l1);
    h = _mm_max_pd(h0_l0, h1_l1);
    return concat(
            _mm_min_pd(l, Reg::permute<X0, X0>(h)),
            _mm_max_pd(h, Reg::permute<X1, X1>(l))
            );
    // extract:     1 cycle
    // min/max:     4 cycles
    // unpacklo/hi: 2 cycles
    // min/max:     4 cycles
    // permute:     1 cycle
    // min/max:     4 cycles
    // insert:      1 cycle
    // ----------------------
    // total:      17 cycles

    /*
    __m256d cdab = Reg::permute<X2, X3, X0, X1>(dcba);
    __m256d l = _mm256_min_pd(dcba, cdab);
    __m256d h = _mm256_max_pd(dcba, cdab);
    __m256d maxmin_ba = Reg::permute128<X0, Y0>(l, h);
    __m256d maxmin_dc = Reg::permute128<X1, Y1>(l, h);

    l = _mm256_min_pd(maxmin_ba, maxmin_dc);
    h = _mm256_max_pd(maxmin_ba, maxmin_dc);

    return _mm256_blend_pd(h, l, 0x55);
    */

    /*
    // a b c d
    // b a d c
    // sort pairs
    __m256d y, l, h;
    __m128d l2, h2;
    y = shuffle<X1, Y0, X3, Y2>(x, x);
    l = _mm256_min_pd(x, y); // min[ab ab cd cd]
    h = _mm256_max_pd(x, y); // max[ab ab cd cd]

    // 1 of 2 is at [0]
    // 1 of 4 is at [1]
    // 1 of 4 is at [2]
    // 1 of 2 is at [3]

    // don't be fooled by unpack here. It works differently for AVX pd than for SSE ps
    x = _mm256_unpacklo_pd(l, h); // l_ab h_ab l_cd h_cd
    l2 = _mm_min_pd(lo128(x), hi128(x)); // l_abcd l(h_ab hcd)
    h2 = _mm_max_pd(lo128(x), hi128(x)); // h(l_ab l_cd) h_abcd

    // either it is:
    return concat(l2, h2);
    // or:
    // concat(_mm_unpacklo_pd(l2, h2), _mm_unpackhi_pd(l2, h2));

    // I'd like to have four useful compares
    const __m128d dc = hi128(dcba);
    const __m128d ba = lo128(dcba);
    const __m256d adcb = avx_cast<__m256d>(concat(_mm_alignr_epi8(avx_cast<__m128i>(dc), avx_cast<__m128i>(ba), 8), _mm_alignr_epi8(avx_cast<__m128i>(ba), avx_cast<__m128i>(dc), 8)));

    const int extraCmp = _mm_movemask_pd(_mm_cmpgt_pd(dc, ba));
    // 0x0: d <= b && c <= a
    // 0x1: d <= b && c > a
    // 0x2: d > b && c <= a
    // 0x3: d > b && c > a

    switch (_mm256_movemask_pd(_mm256_cmpgt_pd(dcba, adcb))) {
    // impossible: 0x0, 0xf
    case 0x1: // a <= b && b <= c && c <= d && d > a
        // abcd
        return Reg::permute<X2, X3, X0, X1>(Reg::permute<X0, X1>(dcba, dcba));
    case 0x2: // a <= b && b <= c && c > d && d <= a
        // dabc
        return Reg::permute<X2, X3, X0, X1>(adcb);
    case 0x3: // a <= b && b <= c && c > d && d > a
        // a[bd]c
        if (extraCmp & 2) {
            // abdc
            return Reg::permute<X2, X3, X1, X0>(Reg::permute<X0, X1>(dcba, dcba));
        } else {
            // adbc
            return Reg::permute<X3, X2, X0, X1>(adcb);
        }
    case 0x4: // a <= b && b > c && c <= d && d <= a
        // cdab;
        return Reg::permute<X2, X3, X0, X1>(dcba);
    case 0x5: // a <= b && b > c && c <= d && d > a
        // [ac] < [bd]
        switch (extraCmp) {
        case 0x0: // d <= b && c <= a
            // cadb
            return shuffle<>(dcba, bcda);
        case 0x1: // d <= b && c > a
        case 0x2: // d > b && c <= a
        case 0x3: // d > b && c > a
        }
    case 0x6: // a <= b && b > c && c > d && d <= a
        // d[ac]b
    case 0x7: // a <= b && b > c && c > d && d > a
        // adcb;
        return permute<X1, X0, X3, X2>(permute128<X1, X0>(bcda, bcda));
    case 0x8: // a > b && b <= c && c <= d && d <= a
        return bcda;
    case 0x9: // a > b && b <= c && c <= d && d > a
        // b[ac]d;
    case 0xa: // a > b && b <= c && c > d && d <= a
        // [ac] > [bd]
    case 0xb: // a > b && b <= c && c > d && d > a
        // badc;
        return permute128<X1, X0>(dcba);
    case 0xc: // a > b && b > c && c <= d && d <= a
        // c[bd]a;
    case 0xd: // a > b && b > c && c <= d && d > a
        // cbad;
        return permute<X1, X0, X3, X2>(bcda);
    case 0xe: // a > b && b > c && c > d && d <= a
        return dcba;
    }
    */
}
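
// Illustrative usage sketch, not part of the library: sorting four doubles with
// the single-register variant above. The array and surrounding code are
// hypothetical.
//
//     double data[4] = { 0.3, 0.1, 0.4, 0.2 };
//     __m256d v = _mm256_loadu_pd(data);
//     v = Vc::AVX::SortHelper<double>::sort(v); // sorted ascending
//     _mm256_storeu_pd(data, v);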

} // namespace AVX
} // namespace Vc