]>
Commit | Line | Data |
---|---|---|
f22341db | 1 | /* This file is part of the Vc library. |
2 | ||
3 | Copyright (C) 2011 Matthias Kretz <kretz@kde.org> | |
4 | ||
5 | Vc is free software: you can redistribute it and/or modify | |
6 | it under the terms of the GNU Lesser General Public License as | |
7 | published by the Free Software Foundation, either version 3 of | |
8 | the License, or (at your option) any later version. | |
9 | ||
10 | Vc is distributed in the hope that it will be useful, but | |
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | GNU Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with Vc. If not, see <http://www.gnu.org/licenses/>. | |
17 | ||
18 | */ | |
19 | ||
7c616f25 | 20 | #include <Vc/avx/intrinsics.h> |
21 | #include <Vc/avx/casts.h> | |
22 | #include <Vc/avx/sorthelper.h> | |
23 | #include <Vc/avx/macros.h> | |
f22341db | 24 | |
c017a39f | 25 | namespace AliRoot { |
f22341db | 26 | namespace Vc |
27 | { | |
28 | namespace AVX | |
29 | { | |
30 | ||
c017a39f | 31 | template<> m128i SortHelper<short>::sort(VTArg _x) |
f22341db | 32 | { |
c017a39f | 33 | m128i lo, hi, y, x = _x; |
f22341db | 34 | // sort pairs |
35 | y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1)); | |
36 | lo = _mm_min_epi16(x, y); | |
37 | hi = _mm_max_epi16(x, y); | |
38 | x = _mm_blend_epi16(lo, hi, 0xaa); | |
39 | ||
40 | // merge left and right quads | |
41 | y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)), _MM_SHUFFLE(0, 1, 2, 3)); | |
42 | lo = _mm_min_epi16(x, y); | |
43 | hi = _mm_max_epi16(x, y); | |
44 | x = _mm_blend_epi16(lo, hi, 0xcc); | |
45 | y = _mm_srli_si128(x, 2); | |
46 | lo = _mm_min_epi16(x, y); | |
47 | hi = _mm_max_epi16(x, y); | |
48 | x = _mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa); | |
49 | ||
50 | // merge quads into octs | |
51 | y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)); | |
52 | y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3)); | |
53 | lo = _mm_min_epi16(x, y); | |
54 | hi = _mm_max_epi16(x, y); | |
55 | ||
56 | x = _mm_unpacklo_epi16(lo, hi); | |
57 | y = _mm_srli_si128(x, 8); | |
58 | lo = _mm_min_epi16(x, y); | |
59 | hi = _mm_max_epi16(x, y); | |
60 | ||
61 | x = _mm_unpacklo_epi16(lo, hi); | |
62 | y = _mm_srli_si128(x, 8); | |
63 | lo = _mm_min_epi16(x, y); | |
64 | hi = _mm_max_epi16(x, y); | |
65 | ||
66 | return _mm_unpacklo_epi16(lo, hi); | |
67 | } | |
c017a39f | 68 | template<> m128i SortHelper<unsigned short>::sort(VTArg _x) |
f22341db | 69 | { |
c017a39f | 70 | m128i lo, hi, y, x = _x; |
f22341db | 71 | // sort pairs |
72 | y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1)); | |
73 | lo = _mm_min_epu16(x, y); | |
74 | hi = _mm_max_epu16(x, y); | |
75 | x = _mm_blend_epi16(lo, hi, 0xaa); | |
76 | ||
77 | // merge left and right quads | |
78 | y = _mm_shufflelo_epi16(_mm_shufflehi_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)), _MM_SHUFFLE(0, 1, 2, 3)); | |
79 | lo = _mm_min_epu16(x, y); | |
80 | hi = _mm_max_epu16(x, y); | |
81 | x = _mm_blend_epi16(lo, hi, 0xcc); | |
82 | y = _mm_srli_si128(x, 2); | |
83 | lo = _mm_min_epu16(x, y); | |
84 | hi = _mm_max_epu16(x, y); | |
85 | x = _mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa); | |
86 | ||
87 | // merge quads into octs | |
88 | y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)); | |
89 | y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3)); | |
90 | lo = _mm_min_epu16(x, y); | |
91 | hi = _mm_max_epu16(x, y); | |
92 | ||
93 | x = _mm_unpacklo_epi16(lo, hi); | |
94 | y = _mm_srli_si128(x, 8); | |
95 | lo = _mm_min_epu16(x, y); | |
96 | hi = _mm_max_epu16(x, y); | |
97 | ||
98 | x = _mm_unpacklo_epi16(lo, hi); | |
99 | y = _mm_srli_si128(x, 8); | |
100 | lo = _mm_min_epu16(x, y); | |
101 | hi = _mm_max_epu16(x, y); | |
102 | ||
103 | return _mm_unpacklo_epi16(lo, hi); | |
104 | } | |
105 | ||
c017a39f | 106 | template<> m256i SortHelper<int>::sort(VTArg _hgfedcba) |
f22341db | 107 | { |
c017a39f | 108 | VectorType hgfedcba = _hgfedcba; |
109 | const m128i hgfe = hi128(hgfedcba); | |
110 | const m128i dcba = lo128(hgfedcba); | |
111 | m128i l = _mm_min_epi32(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea | |
112 | m128i h = _mm_max_epi32(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea | |
f22341db | 113 | |
c017a39f | 114 | m128i x = _mm_unpacklo_epi32(l, h); // ↑fb ↓fb ↑ea ↓ea |
115 | m128i y = _mm_unpackhi_epi32(l, h); // ↑hd ↓hd ↑gc ↓gc | |
f22341db | 116 | |
117 | l = _mm_min_epi32(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca | |
118 | h = _mm_max_epi32(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc) | |
119 | ||
120 | x = _mm_min_epi32(l, Reg::permute<X2, X2, X0, X0>(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca) | |
121 | y = _mm_max_epi32(h, Reg::permute<X3, X3, X1, X1>(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca) | |
122 | ||
c017a39f | 123 | m128i b = Reg::shuffle<Y0, Y1, X0, X1>(y, x); // b3 <= b2 <= b1 <= b0 |
124 | m128i a = _mm_unpackhi_epi64(x, y); // a3 >= a2 >= a1 >= a0 | |
f22341db | 125 | |
79c86c14 | 126 | // _mm_extract_epi32 from clang < 3.4 returns an unsigned int - the static_cast is free for |
127 | // conforming compilers, but fixes broken ones | |
128 | if (VC_IS_UNLIKELY(static_cast<int>(_mm_extract_epi32(x, 2)) >= static_cast<int>(_mm_extract_epi32(y, 1)))) { | |
f22341db | 129 | return concat(Reg::permute<X0, X1, X2, X3>(b), a); |
79c86c14 | 130 | } else if (VC_IS_UNLIKELY(static_cast<int>(_mm_extract_epi32(x, 0)) >= static_cast<int>(_mm_extract_epi32(y, 3)))) { |
f22341db | 131 | return concat(a, Reg::permute<X0, X1, X2, X3>(b)); |
132 | } | |
133 | ||
134 | // merge | |
135 | l = _mm_min_epi32(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0 | |
136 | h = _mm_max_epi32(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0 | |
137 | ||
138 | a = _mm_unpacklo_epi32(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0 | |
139 | b = _mm_unpackhi_epi32(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2 | |
140 | l = _mm_min_epi32(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2 | |
141 | h = _mm_max_epi32(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2) | |
142 | ||
143 | a = _mm_unpacklo_epi32(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2 | |
144 | b = _mm_unpackhi_epi32(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3 | |
145 | l = _mm_min_epi32(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3 | |
146 | h = _mm_max_epi32(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3) | |
147 | ||
148 | return concat(_mm_unpacklo_epi32(l, h), _mm_unpackhi_epi32(l, h)); | |
149 | } | |
150 | ||
c017a39f | 151 | template<> m256i SortHelper<unsigned int>::sort(VTArg _hgfedcba) |
f22341db | 152 | { |
c017a39f | 153 | VectorType hgfedcba = _hgfedcba; |
154 | const m128i hgfe = hi128(hgfedcba); | |
155 | const m128i dcba = lo128(hgfedcba); | |
156 | m128i l = _mm_min_epu32(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea | |
157 | m128i h = _mm_max_epu32(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea | |
f22341db | 158 | |
c017a39f | 159 | m128i x = _mm_unpacklo_epi32(l, h); // ↑fb ↓fb ↑ea ↓ea |
160 | m128i y = _mm_unpackhi_epi32(l, h); // ↑hd ↓hd ↑gc ↓gc | |
f22341db | 161 | |
162 | l = _mm_min_epu32(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca | |
163 | h = _mm_max_epu32(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc) | |
164 | ||
165 | x = _mm_min_epu32(l, Reg::permute<X2, X2, X0, X0>(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca) | |
166 | y = _mm_max_epu32(h, Reg::permute<X3, X3, X1, X1>(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca) | |
167 | ||
c017a39f | 168 | m128i b = Reg::shuffle<Y0, Y1, X0, X1>(y, x); // b3 <= b2 <= b1 <= b0 |
169 | m128i a = _mm_unpackhi_epi64(x, y); // a3 >= a2 >= a1 >= a0 | |
f22341db | 170 | |
171 | if (VC_IS_UNLIKELY(_mm_extract_epu32(x, 2) >= _mm_extract_epu32(y, 1))) { | |
172 | return concat(Reg::permute<X0, X1, X2, X3>(b), a); | |
173 | } else if (VC_IS_UNLIKELY(_mm_extract_epu32(x, 0) >= _mm_extract_epu32(y, 3))) { | |
174 | return concat(a, Reg::permute<X0, X1, X2, X3>(b)); | |
175 | } | |
176 | ||
177 | // merge | |
178 | l = _mm_min_epu32(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0 | |
179 | h = _mm_max_epu32(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0 | |
180 | ||
181 | a = _mm_unpacklo_epi32(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0 | |
182 | b = _mm_unpackhi_epi32(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2 | |
183 | l = _mm_min_epu32(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2 | |
184 | h = _mm_max_epu32(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2) | |
185 | ||
186 | a = _mm_unpacklo_epi32(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2 | |
187 | b = _mm_unpackhi_epi32(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3 | |
188 | l = _mm_min_epu32(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3 | |
189 | h = _mm_max_epu32(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3) | |
190 | ||
191 | return concat(_mm_unpacklo_epi32(l, h), _mm_unpackhi_epi32(l, h)); | |
192 | } | |
193 | ||
c017a39f | 194 | template<> m256 SortHelper<float>::sort(VTArg _hgfedcba) |
f22341db | 195 | { |
c017a39f | 196 | VectorType hgfedcba = _hgfedcba; |
197 | const m128 hgfe = hi128(hgfedcba); | |
198 | const m128 dcba = lo128(hgfedcba); | |
199 | m128 l = _mm_min_ps(hgfe, dcba); // ↓hd ↓gc ↓fb ↓ea | |
200 | m128 h = _mm_max_ps(hgfe, dcba); // ↑hd ↑gc ↑fb ↑ea | |
f22341db | 201 | |
c017a39f | 202 | m128 x = _mm_unpacklo_ps(l, h); // ↑fb ↓fb ↑ea ↓ea |
203 | m128 y = _mm_unpackhi_ps(l, h); // ↑hd ↓hd ↑gc ↓gc | |
f22341db | 204 | |
205 | l = _mm_min_ps(x, y); // ↓(↑fb,↑hd) ↓hfdb ↓(↑ea,↑gc) ↓geca | |
206 | h = _mm_max_ps(x, y); // ↑hfdb ↑(↓fb,↓hd) ↑geca ↑(↓ea,↓gc) | |
207 | ||
208 | x = _mm_min_ps(l, Reg::permute<X2, X2, X0, X0>(h)); // 2(hfdb) 1(hfdb) 2(geca) 1(geca) | |
209 | y = _mm_max_ps(h, Reg::permute<X3, X3, X1, X1>(l)); // 4(hfdb) 3(hfdb) 4(geca) 3(geca) | |
210 | ||
c017a39f | 211 | m128 a = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(x), _mm_castps_pd(y))); // a3 >= a2 >= a1 >= a0 |
212 | m128 b = Reg::shuffle<Y0, Y1, X0, X1>(y, x); // b3 <= b2 <= b1 <= b0 | |
f22341db | 213 | |
214 | // merge | |
215 | l = _mm_min_ps(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0 | |
216 | h = _mm_max_ps(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0 | |
217 | ||
218 | a = _mm_unpacklo_ps(l, h); // ↑a1b1 ↓a1b1 ↑a0b0 ↓a0b0 | |
219 | b = _mm_unpackhi_ps(l, h); // ↑a3b3 ↓a3b3 ↑a2b2 ↓a2b2 | |
220 | l = _mm_min_ps(a, b); // ↓(↑a1b1,↑a3b3) ↓a1b3 ↓(↑a0b0,↑a2b2) ↓a0b2 | |
221 | h = _mm_max_ps(a, b); // ↑a3b1 ↑(↓a1b1,↓a3b3) ↑a2b0 ↑(↓a0b0,↓a2b2) | |
222 | ||
223 | a = _mm_unpacklo_ps(l, h); // ↑a2b0 ↓(↑a0b0,↑a2b2) ↑(↓a0b0,↓a2b2) ↓a0b2 | |
224 | b = _mm_unpackhi_ps(l, h); // ↑a3b1 ↓(↑a1b1,↑a3b3) ↑(↓a1b1,↓a3b3) ↓a1b3 | |
225 | l = _mm_min_ps(a, b); // ↓(↑a2b0,↑a3b1) ↓(↑a0b0,↑a2b2,↑a1b1,↑a3b3) ↓(↑(↓a0b0,↓a2b2) ↑(↓a1b1,↓a3b3)) ↓a0b3 | |
226 | h = _mm_max_ps(a, b); // ↑a3b0 ↑(↓(↑a0b0,↑a2b2) ↓(↑a1b1,↑a3b3)) ↑(↓a0b0,↓a2b2,↓a1b1,↓a3b3) ↑(↓a0b2,↓a1b3) | |
227 | ||
228 | return concat(_mm_unpacklo_ps(l, h), _mm_unpackhi_ps(l, h)); | |
229 | } | |
230 | ||
c017a39f | 231 | template<> m256 SortHelper<sfloat>::sort(VTArg hgfedcba) |
f22341db | 232 | { |
233 | return SortHelper<float>::sort(hgfedcba); | |
234 | } | |
235 | ||
c017a39f | 236 | template<> void SortHelper<double>::sort(m256d &VC_RESTRICT x, m256d &VC_RESTRICT y) |
f22341db | 237 | { |
c017a39f | 238 | m256d l = _mm256_min_pd(x, y); // ↓x3y3 ↓x2y2 ↓x1y1 ↓x0y0 |
239 | m256d h = _mm256_max_pd(x, y); // ↑x3y3 ↑x2y2 ↑x1y1 ↑x0y0 | |
f22341db | 240 | x = _mm256_unpacklo_pd(l, h); // ↑x2y2 ↓x2y2 ↑x0y0 ↓x0y0 |
241 | y = _mm256_unpackhi_pd(l, h); // ↑x3y3 ↓x3y3 ↑x1y1 ↓x1y1 | |
242 | l = _mm256_min_pd(x, y); // ↓(↑x2y2,↑x3y3) ↓x3x2y3y2 ↓(↑x0y0,↑x1y1) ↓x1x0y1y0 | |
243 | h = _mm256_max_pd(x, y); // ↑x3x2y3y2 ↑(↓x2y2,↓x3y3) ↑x1x0y1y0 ↑(↓x0y0,↓x1y1) | |
244 | x = _mm256_unpacklo_pd(l, h); // ↑(↓x2y2,↓x3y3) ↓x3x2y3y2 ↑(↓x0y0,↓x1y1) ↓x1x0y1y0 | |
245 | y = _mm256_unpackhi_pd(h, l); // ↓(↑x2y2,↑x3y3) ↑x3x2y3y2 ↓(↑x0y0,↑x1y1) ↑x1x0y1y0 | |
246 | l = _mm256_min_pd(x, y); // ↓(↑(↓x2y2,↓x3y3) ↓(↑x2y2,↑x3y3)) ↓x3x2y3y2 ↓(↑(↓x0y0,↓x1y1) ↓(↑x0y0,↑x1y1)) ↓x1x0y1y0 | |
247 | h = _mm256_max_pd(x, y); // ↑(↑(↓x2y2,↓x3y3) ↓(↑x2y2,↑x3y3)) ↑x3x2y3y2 ↑(↑(↓x0y0,↓x1y1) ↓(↑x0y0,↑x1y1)) ↑x1x0y1y0 | |
c017a39f | 248 | m256d a = Reg::permute<X2, X3, X1, X0>(Reg::permute128<X0, X1>(h, h)); // h0 h1 h3 h2 |
249 | m256d b = Reg::permute<X2, X3, X1, X0>(l); // l2 l3 l1 l0 | |
f22341db | 250 | |
251 | // a3 >= a2 >= b1 >= b0 | |
252 | // b3 <= b2 <= a1 <= a0 | |
253 | ||
254 | // merge | |
255 | l = _mm256_min_pd(a, b); // ↓a3b3 ↓a2b2 ↓a1b1 ↓a0b0 | |
256 | h = _mm256_min_pd(a, b); // ↑a3b3 ↑a2b2 ↑a1b1 ↑a0b0 | |
257 | ||
258 | x = _mm256_unpacklo_pd(l, h); // ↑a2b2 ↓a2b2 ↑a0b0 ↓a0b0 | |
259 | y = _mm256_unpackhi_pd(l, h); // ↑a3b3 ↓a3b3 ↑a1b1 ↓a1b1 | |
260 | l = _mm256_min_pd(x, y); // ↓(↑a2b2,↑a3b3) ↓a2b3 ↓(↑a0b0,↑a1b1) ↓a1b0 | |
261 | h = _mm256_min_pd(x, y); // ↑a3b2 ↑(↓a2b2,↓a3b3) ↑a0b1 ↑(↓a0b0,↓a1b1) | |
262 | ||
263 | x = Reg::permute128<Y0, X0>(l, h); // ↑a0b1 ↑(↓a0b0,↓a1b1) ↓(↑a0b0,↑a1b1) ↓a1b0 | |
264 | y = Reg::permute128<Y1, X1>(l, h); // ↑a3b2 ↑(↓a2b2,↓a3b3) ↓(↑a2b2,↑a3b3) ↓a2b3 | |
265 | l = _mm256_min_pd(x, y); // ↓(↑a0b1,↑a3b2) ↓(↑(↓a0b0,↓a1b1) ↑(↓a2b2,↓a3b3)) ↓(↑a0b0,↑a1b1,↑a2b2,↑a3b3) ↓b0b3 | |
266 | h = _mm256_min_pd(x, y); // ↑a0a3 ↑(↓a0b0,↓a1b1,↓a2b2,↓a3b3) ↑(↓(↑a0b0,↑a1b1) ↓(↑a2b2,↑a3b3)) ↑(↓a1b0,↓a2b3) | |
267 | ||
268 | x = _mm256_unpacklo_pd(l, h); // h2 l2 h0 l0 | |
269 | y = _mm256_unpackhi_pd(l, h); // h3 l3 h1 l1 | |
270 | } | |
c017a39f | 271 | template<> m256d SortHelper<double>::sort(VTArg _dcba) |
f22341db | 272 | { |
c017a39f | 273 | VectorType dcba = _dcba; |
f22341db | 274 | /* |
275 | * to find the second largest number find | |
276 | * max(min(max(ab),max(cd)), min(max(ad),max(bc))) | |
277 | * or | |
278 | * max(max(min(ab),min(cd)), min(max(ab),max(cd))) | |
279 | * | |
c017a39f | 280 | const m256d adcb = avx_cast<m256d>(concat(_mm_alignr_epi8(avx_cast<m128i>(dc), avx_cast<m128i>(ba), 8), _mm_alignr_epi8(avx_cast<m128i>(ba), avx_cast<m128i>(dc), 8))); |
281 | const m256d l = _mm256_min_pd(dcba, adcb); // min(ad cd bc ab) | |
282 | const m256d h = _mm256_max_pd(dcba, adcb); // max(ad cd bc ab) | |
f22341db | 283 | // max(h3, h1) |
284 | // max(min(h0,h2), min(h3,h1)) | |
285 | // min(max(l0,l2), max(l3,l1)) | |
286 | // min(l3, l1) | |
287 | ||
c017a39f | 288 | const m256d ll = _mm256_min_pd(h, Reg::permute128<X0, X1>(h, h)); // min(h3h1 h2h0 h1h3 h0h2) |
289 | //const m256d hh = _mm256_max_pd(h3 ll1_3 l1 l0, h1 ll0_2 l3 l2); | |
290 | const m256d hh = _mm256_max_pd( | |
f22341db | 291 | Reg::permute128<X1, Y0>(_mm256_unpackhi_pd(ll, h), l), |
292 | Reg::permute128<X0, Y1>(_mm256_blend_pd(h ll, 0x1), l)); | |
293 | _mm256_min_pd(hh0, hh1 | |
294 | */ | |
295 | ||
296 | ////////////////////////////////////////////////////////////////////////////////// | |
297 | // max(max(ac), max(bd)) | |
298 | // max(max(min(ac),min(bd)), min(max(ac),max(bd))) | |
299 | // min(max(min(ac),min(bd)), min(max(ac),max(bd))) | |
300 | // min(min(ac), min(bd)) | |
c017a39f | 301 | m128d l = _mm_min_pd(lo128(dcba), hi128(dcba)); // min(bd) min(ac) |
302 | m128d h = _mm_max_pd(lo128(dcba), hi128(dcba)); // max(bd) max(ac) | |
303 | m128d h0_l0 = _mm_unpacklo_pd(l, h); | |
304 | m128d h1_l1 = _mm_unpackhi_pd(l, h); | |
f22341db | 305 | l = _mm_min_pd(h0_l0, h1_l1); |
306 | h = _mm_max_pd(h0_l0, h1_l1); | |
307 | return concat( | |
308 | _mm_min_pd(l, Reg::permute<X0, X0>(h)), | |
309 | _mm_max_pd(h, Reg::permute<X1, X1>(l)) | |
310 | ); | |
311 | // extract: 1 cycle | |
312 | // min/max: 4 cycles | |
313 | // unpacklo/hi: 2 cycles | |
314 | // min/max: 4 cycles | |
315 | // permute: 1 cycle | |
316 | // min/max: 4 cycles | |
317 | // insert: 1 cycle | |
318 | // ---------------------- | |
319 | // total: 17 cycles | |
320 | ||
321 | /* | |
c017a39f | 322 | m256d cdab = Reg::permute<X2, X3, X0, X1>(dcba); |
323 | m256d l = _mm256_min_pd(dcba, cdab); | |
324 | m256d h = _mm256_max_pd(dcba, cdab); | |
325 | m256d maxmin_ba = Reg::permute128<X0, Y0>(l, h); | |
326 | m256d maxmin_dc = Reg::permute128<X1, Y1>(l, h); | |
f22341db | 327 | |
328 | l = _mm256_min_pd(maxmin_ba, maxmin_dc); | |
329 | h = _mm256_max_pd(maxmin_ba, maxmin_dc); | |
330 | ||
331 | return _mm256_blend_pd(h, l, 0x55); | |
332 | */ | |
333 | ||
334 | /* | |
335 | // a b c d | |
336 | // b a d c | |
337 | // sort pairs | |
c017a39f | 338 | m256d y, l, h; |
339 | m128d l2, h2; | |
f22341db | 340 | y = shuffle<X1, Y0, X3, Y2>(x, x); |
341 | l = _mm256_min_pd(x, y); // min[ab ab cd cd] | |
342 | h = _mm256_max_pd(x, y); // max[ab ab cd cd] | |
343 | ||
344 | // 1 of 2 is at [0] | |
345 | // 1 of 4 is at [1] | |
346 | // 1 of 4 is at [2] | |
347 | // 1 of 2 is at [3] | |
348 | ||
349 | // don't be fooled by unpack here. It works differently for AVX pd than for SSE ps | |
350 | x = _mm256_unpacklo_pd(l, h); // l_ab h_ab l_cd h_cd | |
351 | l2 = _mm_min_pd(lo128(x), hi128(x)); // l_abcd l(h_ab hcd) | |
352 | h2 = _mm_max_pd(lo128(x), hi128(x)); // h(l_ab l_cd) h_abcd | |
353 | ||
354 | // either it is: | |
355 | return concat(l2, h2); | |
356 | // or: | |
357 | // concat(_mm_unpacklo_pd(l2, h2), _mm_unpackhi_pd(l2, h2)); | |
358 | ||
359 | // I'd like to have four useful compares | |
c017a39f | 360 | const m128d dc = hi128(dcba); |
361 | const m128d ba = lo128(dcba); | |
362 | const m256d adcb = avx_cast<m256d>(concat(_mm_alignr_epi8(avx_cast<m128i>(dc), avx_cast<m128i>(ba), 8), _mm_alignr_epi8(avx_cast<m128i>(ba), avx_cast<m128i>(dc), 8))); | |
f22341db | 363 | |
364 | const int extraCmp = _mm_movemask_pd(_mm_cmpgt_pd(dc, ba)); | |
365 | // 0x0: d <= b && c <= a | |
366 | // 0x1: d <= b && c > a | |
367 | // 0x2: d > b && c <= a | |
368 | // 0x3: d > b && c > a | |
369 | ||
370 | switch (_mm256_movemask_pd(_mm256_cmpgt_pd(dcba, adcb))) { | |
371 | // impossible: 0x0, 0xf | |
372 | case 0x1: // a <= b && b <= c && c <= d && d > a | |
373 | // abcd | |
374 | return Reg::permute<X2, X3, X0, X1>(Reg::permute<X0, X1>(dcba, dcba)); | |
375 | case 0x2: // a <= b && b <= c && c > d && d <= a | |
376 | // dabc | |
377 | return Reg::permute<X2, X3, X0, X1>(adcb); | |
378 | case 0x3: // a <= b && b <= c && c > d && d > a | |
379 | // a[bd]c | |
380 | if (extraCmp & 2) { | |
381 | // abdc | |
382 | return Reg::permute<X2, X3, X1, X0>(Reg::permute<X0, X1>(dcba, dcba)); | |
383 | } else { | |
384 | // adbc | |
385 | return Reg::permute<X3, X2, X0, X1>(adcb); | |
386 | } | |
387 | case 0x4: // a <= b && b > c && c <= d && d <= a | |
388 | // cdab; | |
389 | return Reg::permute<X2, X3, X0, X1>(dcba); | |
390 | case 0x5: // a <= b && b > c && c <= d && d > a | |
391 | // [ac] < [bd] | |
392 | switch (extraCmp) { | |
393 | case 0x0: // d <= b && c <= a | |
394 | // cadb | |
395 | return shuffle<>(dcba, bcda); | |
396 | case 0x1: // d <= b && c > a | |
397 | case 0x2: // d > b && c <= a | |
398 | case 0x3: // d > b && c > a | |
399 | } | |
400 | case 0x6: // a <= b && b > c && c > d && d <= a | |
401 | // d[ac]b | |
402 | case 0x7: // a <= b && b > c && c > d && d > a | |
403 | // adcb; | |
404 | return permute<X1, X0, X3, X2>(permute128<X1, X0>(bcda, bcda)); | |
405 | case 0x8: // a > b && b <= c && c <= d && d <= a | |
406 | return bcda; | |
407 | case 0x9: // a > b && b <= c && c <= d && d > a | |
408 | // b[ac]d; | |
409 | case 0xa: // a > b && b <= c && c > d && d <= a | |
410 | // [ac] > [bd] | |
411 | case 0xb: // a > b && b <= c && c > d && d > a | |
412 | // badc; | |
413 | return permute128<X1, X0>(dcba); | |
414 | case 0xc: // a > b && b > c && c <= d && d <= a | |
415 | // c[bd]a; | |
416 | case 0xd: // a > b && b > c && c <= d && d > a | |
417 | // cbad; | |
418 | return permute<X1, X0, X3, X2>(bcda); | |
419 | case 0xe: // a > b && b > c && c > d && d <= a | |
420 | return dcba; | |
421 | } | |
422 | */ | |
423 | } | |
424 | ||
425 | } // namespace AVX | |
426 | } // namespace Vc | |
c017a39f | 427 | } // namespace AliRoot |