]>
Commit | Line | Data |
---|---|---|
f22341db | 1 | /* This file is part of the Vc library. |
2 | ||
3 | Copyright (C) 2009-2011 Matthias Kretz <kretz@kde.org> | |
4 | ||
5 | Vc is free software: you can redistribute it and/or modify | |
6 | it under the terms of the GNU Lesser General Public License as | |
7 | published by the Free Software Foundation, either version 3 of | |
8 | the License, or (at your option) any later version. | |
9 | ||
10 | Vc is distributed in the hope that it will be useful, but | |
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | GNU Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with Vc. If not, see <http://www.gnu.org/licenses/>. | |
17 | ||
18 | */ | |
19 | ||
20 | #include "casts.h" | |
21 | #include <cstdlib> | |
22 | ||
c017a39f | 23 | namespace AliRoot { |
f22341db | 24 | namespace Vc |
25 | { | |
26 | namespace SSE | |
27 | { | |
28 | ||
29 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
30 | // float_v | |
c017a39f | 31 | template<> Vc_ALWAYS_INLINE Vc_PURE _M128 VectorHelper<_M128>::load(const float *x, AlignedFlag) |
f22341db | 32 | { |
33 | return _mm_load_ps(x); | |
34 | } | |
35 | ||
c017a39f | 36 | template<> Vc_ALWAYS_INLINE Vc_PURE _M128 VectorHelper<_M128>::load(const float *x, UnalignedFlag) |
f22341db | 37 | { |
38 | return _mm_loadu_ps(x); | |
39 | } | |
40 | ||
c017a39f | 41 | template<> Vc_ALWAYS_INLINE Vc_PURE _M128 VectorHelper<_M128>::load(const float *x, StreamingAndAlignedFlag) |
f22341db | 42 | { |
43 | return _mm_stream_load(x); | |
44 | } | |
45 | ||
c017a39f | 46 | template<> Vc_ALWAYS_INLINE Vc_PURE _M128 VectorHelper<_M128>::load(const float *x, StreamingAndUnalignedFlag) |
f22341db | 47 | { |
48 | return load(x, Unaligned); | |
49 | } | |
50 | ||
51 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
52 | // stores | |
// Writes the four floats of x to mem with an aligned store.
// _mm_store_ps requires mem to be 16-byte aligned.
Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, AlignedFlag)
{
    _mm_store_ps(mem, x);
}
// Writes the four floats of x to mem with an unaligned store (_mm_storeu_ps);
// mem may have any alignment.
Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, UnalignedFlag)
{
    _mm_storeu_ps(mem, x);
}
// Writes the four floats of x to mem with a non-temporal (streaming) store.
// _mm_stream_ps requires mem to be 16-byte aligned.
Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, StreamingAndAlignedFlag)
{
    _mm_stream_ps(mem, x);
}
c017a39f | 65 | Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, StreamingAndUnalignedFlag) |
f22341db | 66 | { |
67 | _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); | |
68 | } | |
c017a39f | 69 | Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, const VectorType m, AlignedFlag) |
f22341db | 70 | { |
71 | _mm_store_ps(mem, _mm_blendv_ps(_mm_load_ps(mem), x, m)); | |
72 | } | |
c017a39f | 73 | Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, const VectorType m, UnalignedFlag) |
f22341db | 74 | { |
75 | _mm_storeu_ps(mem, _mm_blendv_ps(_mm_loadu_ps(mem), x, m)); | |
76 | } | |
c017a39f | 77 | Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag) |
f22341db | 78 | { |
79 | _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_castps_si128(m), reinterpret_cast<char *>(mem)); | |
80 | } | |
c017a39f | 81 | Vc_ALWAYS_INLINE void VectorHelper<_M128>::store(float *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) |
f22341db | 82 | { |
83 | _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_castps_si128(m), reinterpret_cast<char *>(mem)); | |
84 | } | |
85 | ||
86 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
87 | // sfloat_v | |
c017a39f | 88 | template<> Vc_ALWAYS_INLINE Vc_PURE M256 VectorHelper<M256>::load(const float *x, AlignedFlag) |
f22341db | 89 | { |
90 | return VectorType::create(_mm_load_ps(x), _mm_load_ps(x + 4)); | |
91 | } | |
92 | ||
c017a39f | 93 | template<> Vc_ALWAYS_INLINE Vc_PURE M256 VectorHelper<M256>::load(const float *x, UnalignedFlag) |
f22341db | 94 | { |
95 | return VectorType::create(_mm_loadu_ps(x), _mm_loadu_ps(x + 4)); | |
96 | } | |
97 | ||
c017a39f | 98 | template<> Vc_ALWAYS_INLINE Vc_PURE M256 VectorHelper<M256>::load(const float *x, StreamingAndAlignedFlag) |
f22341db | 99 | { |
100 | return VectorType::create(_mm_stream_load(&x[0]), _mm_stream_load(&x[4])); | |
101 | } | |
102 | ||
c017a39f | 103 | template<> Vc_ALWAYS_INLINE Vc_PURE M256 VectorHelper<M256>::load(const float *x, StreamingAndUnalignedFlag) |
f22341db | 104 | { |
105 | return load(x, Unaligned); | |
106 | } | |
107 | ||
108 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
109 | // stores | |
c017a39f | 110 | Vc_ALWAYS_INLINE void VectorHelper<M256>::store(float *mem, VectorTypeArg x, AlignedFlag) |
f22341db | 111 | { |
112 | _mm_store_ps(mem, x[0]); | |
113 | _mm_store_ps(mem + 4, x[1]); | |
114 | } | |
c017a39f | 115 | Vc_ALWAYS_INLINE void VectorHelper<M256>::store(float *mem, VectorTypeArg x, UnalignedFlag) |
f22341db | 116 | { |
117 | _mm_storeu_ps(mem, x[0]); | |
118 | _mm_storeu_ps(mem + 4, x[1]); | |
119 | } | |
c017a39f | 120 | Vc_ALWAYS_INLINE void VectorHelper<M256>::store(float *mem, VectorTypeArg x, StreamingAndAlignedFlag) |
f22341db | 121 | { |
122 | _mm_stream_ps(mem, x[0]); | |
123 | _mm_stream_ps(mem + 4, x[1]); | |
124 | } | |
c017a39f | 125 | Vc_ALWAYS_INLINE void VectorHelper<M256>::store(float *mem, VectorTypeArg x, StreamingAndUnalignedFlag) |
f22341db | 126 | { |
127 | _mm_maskmoveu_si128(_mm_castps_si128(x[0]), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); | |
128 | _mm_maskmoveu_si128(_mm_castps_si128(x[1]), _mm_setallone_si128(), reinterpret_cast<char *>(mem + 4)); | |
129 | } | |
c017a39f | 130 | Vc_ALWAYS_INLINE void VectorHelper<M256>::store(float *mem, VectorTypeArg x, VectorTypeArg m, AlignedFlag) |
f22341db | 131 | { |
132 | _mm_store_ps(mem, _mm_blendv_ps(_mm_load_ps(mem), x[0], m[0])); | |
133 | _mm_store_ps(mem + 4, _mm_blendv_ps(_mm_load_ps(mem + 4), x[1], m[1])); | |
134 | } | |
c017a39f | 135 | Vc_ALWAYS_INLINE void VectorHelper<M256>::store(float *mem, VectorTypeArg x, VectorTypeArg m, UnalignedFlag) |
f22341db | 136 | { |
137 | _mm_storeu_ps(mem, _mm_blendv_ps(_mm_loadu_ps(mem), x[0], m[0])); | |
138 | _mm_storeu_ps(mem + 4, _mm_blendv_ps(_mm_loadu_ps(mem + 4), x[1], m[1])); | |
139 | } | |
c017a39f | 140 | Vc_ALWAYS_INLINE void VectorHelper<M256>::store(float *mem, VectorTypeArg x, VectorTypeArg m, StreamingAndAlignedFlag) |
f22341db | 141 | { |
142 | _mm_maskmoveu_si128(_mm_castps_si128(x[0]), _mm_castps_si128(m[0]), reinterpret_cast<char *>(mem)); | |
143 | _mm_maskmoveu_si128(_mm_castps_si128(x[1]), _mm_castps_si128(m[1]), reinterpret_cast<char *>(mem + 4)); | |
144 | } | |
c017a39f | 145 | Vc_ALWAYS_INLINE void VectorHelper<M256>::store(float *mem, VectorTypeArg x, VectorTypeArg m, StreamingAndUnalignedFlag) |
f22341db | 146 | { |
147 | _mm_maskmoveu_si128(_mm_castps_si128(x[0]), _mm_castps_si128(m[0]), reinterpret_cast<char *>(mem)); | |
148 | _mm_maskmoveu_si128(_mm_castps_si128(x[1]), _mm_castps_si128(m[1]), reinterpret_cast<char *>(mem + 4)); | |
149 | } | |
150 | ||
151 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
152 | // double_v | |
c017a39f | 153 | template<> Vc_ALWAYS_INLINE Vc_PURE _M128D VectorHelper<_M128D>::load(const double *x, AlignedFlag) |
f22341db | 154 | { |
155 | return _mm_load_pd(x); | |
156 | } | |
157 | ||
c017a39f | 158 | template<> Vc_ALWAYS_INLINE Vc_PURE _M128D VectorHelper<_M128D>::load(const double *x, UnalignedFlag) |
f22341db | 159 | { |
160 | return _mm_loadu_pd(x); | |
161 | } | |
162 | ||
c017a39f | 163 | template<> Vc_ALWAYS_INLINE Vc_PURE _M128D VectorHelper<_M128D>::load(const double *x, StreamingAndAlignedFlag) |
f22341db | 164 | { |
165 | return _mm_stream_load(x); | |
166 | } | |
167 | ||
c017a39f | 168 | template<> Vc_ALWAYS_INLINE Vc_PURE _M128D VectorHelper<_M128D>::load(const double *x, StreamingAndUnalignedFlag) |
f22341db | 169 | { |
170 | return load(x, Unaligned); | |
171 | } | |
172 | ||
173 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
174 | // stores | |
// Writes the two doubles of x to mem with an aligned store.
// _mm_store_pd requires mem to be 16-byte aligned.
Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, AlignedFlag)
{
    _mm_store_pd(mem, x);
}
// Writes the two doubles of x to mem with an unaligned store (_mm_storeu_pd);
// mem may have any alignment.
Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, UnalignedFlag)
{
    _mm_storeu_pd(mem, x);
}
// Writes the two doubles of x to mem with a non-temporal (streaming) store.
// _mm_stream_pd requires mem to be 16-byte aligned.
Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, StreamingAndAlignedFlag)
{
    _mm_stream_pd(mem, x);
}
c017a39f | 187 | Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, StreamingAndUnalignedFlag) |
f22341db | 188 | { |
189 | _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); | |
190 | } | |
c017a39f | 191 | Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, const VectorType m, AlignedFlag) |
f22341db | 192 | { |
193 | _mm_store_pd(mem, _mm_blendv_pd(_mm_load_pd(mem), x, m)); | |
194 | } | |
c017a39f | 195 | Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, const VectorType m, UnalignedFlag) |
f22341db | 196 | { |
197 | _mm_storeu_pd(mem, _mm_blendv_pd(_mm_loadu_pd(mem), x, m)); | |
198 | } | |
c017a39f | 199 | Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag) |
f22341db | 200 | { |
201 | _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_castpd_si128(m), reinterpret_cast<char *>(mem)); | |
202 | } | |
c017a39f | 203 | Vc_ALWAYS_INLINE void VectorHelper<_M128D>::store(double *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) |
f22341db | 204 | { |
205 | _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_castpd_si128(m), reinterpret_cast<char *>(mem)); | |
206 | } | |
207 | ||
208 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
209 | // int_v, uint_v, short_v, ushort_v | |
c017a39f | 210 | template<typename T> Vc_ALWAYS_INLINE Vc_PURE _M128I VectorHelper<_M128I>::load(const T *x, AlignedFlag) |
f22341db | 211 | { |
212 | return _mm_load_si128(reinterpret_cast<const VectorType *>(x)); | |
213 | } | |
214 | ||
c017a39f | 215 | template<typename T> Vc_ALWAYS_INLINE Vc_PURE _M128I VectorHelper<_M128I>::load(const T *x, UnalignedFlag) |
f22341db | 216 | { |
217 | return _mm_loadu_si128(reinterpret_cast<const VectorType *>(x)); | |
218 | } | |
219 | ||
c017a39f | 220 | template<typename T> Vc_ALWAYS_INLINE Vc_PURE _M128I VectorHelper<_M128I>::load(const T *x, StreamingAndAlignedFlag) |
f22341db | 221 | { |
222 | return _mm_stream_load(x); | |
223 | } | |
224 | ||
c017a39f | 225 | template<typename T> Vc_ALWAYS_INLINE Vc_PURE _M128I VectorHelper<_M128I>::load(const T *x, StreamingAndUnalignedFlag) |
f22341db | 226 | { |
227 | return load(x, Unaligned); | |
228 | } | |
229 | ||
230 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
231 | // stores | |
c017a39f | 232 | template<typename T> Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, AlignedFlag) |
f22341db | 233 | { |
234 | _mm_store_si128(reinterpret_cast<VectorType *>(mem), x); | |
235 | } | |
c017a39f | 236 | template<typename T> Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, UnalignedFlag) |
f22341db | 237 | { |
238 | _mm_storeu_si128(reinterpret_cast<VectorType *>(mem), x); | |
239 | } | |
c017a39f | 240 | template<typename T> Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, StreamingAndAlignedFlag) |
f22341db | 241 | { |
242 | _mm_stream_si128(reinterpret_cast<VectorType *>(mem), x); | |
243 | } | |
c017a39f | 244 | template<typename T> Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, StreamingAndUnalignedFlag) |
f22341db | 245 | { |
246 | _mm_maskmoveu_si128(x, _mm_setallone_si128(), reinterpret_cast<char *>(mem)); | |
247 | } | |
c017a39f | 248 | template<typename T> Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, const VectorType m, AlignedFlag align) |
f22341db | 249 | { |
250 | store(mem, _mm_blendv_epi8(load(mem, align), x, m), align); | |
251 | } | |
c017a39f | 252 | template<typename T> Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, const VectorType m, UnalignedFlag align) |
f22341db | 253 | { |
254 | store(mem, _mm_blendv_epi8(load(mem, align), x, m), align); | |
255 | } | |
c017a39f | 256 | template<typename T> Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag) |
f22341db | 257 | { |
258 | _mm_maskmoveu_si128(x, m, reinterpret_cast<char *>(mem)); | |
259 | } | |
c017a39f | 260 | template<typename T> Vc_ALWAYS_INLINE void VectorHelper<_M128I>::store(T *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag) |
f22341db | 261 | { |
262 | _mm_maskmoveu_si128(x, m, reinterpret_cast<char *>(mem)); | |
263 | } | |
264 | ||
c017a39f | 265 | template<> inline Vc_CONST _M128I SortHelper<_M128I, 8>::sort(_M128I x) |
f22341db | 266 | { |
267 | _M128I lo, hi, y; | |
268 | // sort pairs | |
c017a39f | 269 | y = Mem::permute<X1, X0, X3, X2, X5, X4, X7, X6>(x); |
f22341db | 270 | lo = _mm_min_epi16(x, y); |
271 | hi = _mm_max_epi16(x, y); | |
272 | x = _mm_blend_epi16(lo, hi, 0xaa); | |
273 | ||
274 | // merge left and right quads | |
c017a39f | 275 | y = Mem::permute<X3, X2, X1, X0, X7, X6, X5, X4>(x); |
f22341db | 276 | lo = _mm_min_epi16(x, y); |
277 | hi = _mm_max_epi16(x, y); | |
278 | x = _mm_blend_epi16(lo, hi, 0xcc); | |
279 | y = _mm_srli_si128(x, 2); | |
280 | lo = _mm_min_epi16(x, y); | |
281 | hi = _mm_max_epi16(x, y); | |
282 | x = _mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa); | |
283 | ||
284 | // merge quads into octs | |
285 | y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)); | |
286 | y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3)); | |
287 | lo = _mm_min_epi16(x, y); | |
288 | hi = _mm_max_epi16(x, y); | |
289 | ||
290 | x = _mm_unpacklo_epi16(lo, hi); | |
291 | y = _mm_srli_si128(x, 8); | |
292 | lo = _mm_min_epi16(x, y); | |
293 | hi = _mm_max_epi16(x, y); | |
294 | ||
295 | x = _mm_unpacklo_epi16(lo, hi); | |
296 | y = _mm_srli_si128(x, 8); | |
297 | lo = _mm_min_epi16(x, y); | |
298 | hi = _mm_max_epi16(x, y); | |
299 | ||
300 | return _mm_unpacklo_epi16(lo, hi); | |
301 | } | |
c017a39f | 302 | template<> inline Vc_CONST _M128I SortHelper<_M128I, 4>::sort(_M128I x) |
f22341db | 303 | { |
304 | /* | |
305 | // in 16,67% of the cases the merge can be replaced by an append | |
306 | ||
307 | // x = [a b c d] | |
308 | // y = [c d a b] | |
309 | _M128I y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)); | |
310 | _M128I l = _mm_min_epi32(x, y); // min[ac bd ac bd] | |
311 | _M128I h = _mm_max_epi32(x, y); // max[ac bd ac bd] | |
312 | if (IS_UNLIKELY(_mm_cvtsi128_si32(h) <= l[1])) { // l[0] < h[0] < l[1] < h[1] | |
313 | return _mm_unpacklo_epi32(l, h); | |
314 | } | |
315 | // h[0] > l[1] | |
316 | */ | |
317 | ||
318 | // sort pairs | |
319 | _M128I y = _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)); | |
320 | _M128I l = _mm_min_epi32(x, y); | |
321 | _M128I h = _mm_max_epi32(x, y); | |
322 | x = _mm_unpacklo_epi32(l, h); | |
323 | y = _mm_unpackhi_epi32(h, l); | |
324 | ||
325 | // sort quads | |
326 | l = _mm_min_epi32(x, y); | |
327 | h = _mm_max_epi32(x, y); | |
328 | x = _mm_unpacklo_epi32(l, h); | |
329 | y = _mm_unpackhi_epi64(x, x); | |
330 | ||
331 | l = _mm_min_epi32(x, y); | |
332 | h = _mm_max_epi32(x, y); | |
333 | return _mm_unpacklo_epi32(l, h); | |
334 | } | |
c017a39f | 335 | template<> inline Vc_CONST _M128 SortHelper<_M128, 4>::sort(_M128 x) |
f22341db | 336 | { |
337 | _M128 y = _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1)); | |
338 | _M128 l = _mm_min_ps(x, y); | |
339 | _M128 h = _mm_max_ps(x, y); | |
340 | x = _mm_unpacklo_ps(l, h); | |
341 | y = _mm_unpackhi_ps(h, l); | |
342 | ||
343 | l = _mm_min_ps(x, y); | |
344 | h = _mm_max_ps(x, y); | |
345 | x = _mm_unpacklo_ps(l, h); | |
346 | y = _mm_movehl_ps(x, x); | |
347 | ||
348 | l = _mm_min_ps(x, y); | |
349 | h = _mm_max_ps(x, y); | |
350 | return _mm_unpacklo_ps(l, h); | |
351 | //X _M128 k = _mm_cmpgt_ps(x, y); | |
352 | //X k = _mm_shuffle_ps(k, k, _MM_SHUFFLE(2, 2, 0, 0)); | |
353 | //X x = _mm_blendv_ps(x, y, k); | |
354 | //X y = _mm_shuffle_ps(x, x, _MM_SHUFFLE(1, 0, 3, 2)); | |
355 | //X k = _mm_cmpgt_ps(x, y); | |
356 | //X k = _mm_shuffle_ps(k, k, _MM_SHUFFLE(1, 0, 1, 0)); | |
357 | //X x = _mm_blendv_ps(x, y, k); | |
358 | //X y = _mm_shuffle_ps(x, x, _MM_SHUFFLE(3, 1, 2, 0)); | |
359 | //X k = _mm_cmpgt_ps(x, y); | |
360 | //X k = _mm_shuffle_ps(k, k, _MM_SHUFFLE(0, 1, 1, 0)); | |
361 | //X return _mm_blendv_ps(x, y, k); | |
362 | } | |
c017a39f | 363 | template<> inline Vc_PURE M256 SortHelper<M256, 8>::sort(const M256 &_x) |
f22341db | 364 | { |
365 | M256 x = _x; | |
366 | typedef SortHelper<_M128, 4> H; | |
367 | ||
368 | _M128 a, b, l, h; | |
369 | a = H::sort(x[0]); | |
370 | b = H::sort(x[1]); | |
371 | ||
372 | // merge | |
373 | b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)); | |
374 | l = _mm_min_ps(a, b); | |
375 | h = _mm_max_ps(a, b); | |
376 | ||
377 | a = _mm_unpacklo_ps(l, h); | |
378 | b = _mm_unpackhi_ps(l, h); | |
379 | l = _mm_min_ps(a, b); | |
380 | h = _mm_max_ps(a, b); | |
381 | ||
382 | a = _mm_unpacklo_ps(l, h); | |
383 | b = _mm_unpackhi_ps(l, h); | |
384 | l = _mm_min_ps(a, b); | |
385 | h = _mm_max_ps(a, b); | |
386 | ||
387 | x[0] = _mm_unpacklo_ps(l, h); | |
388 | x[1] = _mm_unpackhi_ps(l, h); | |
389 | return x; | |
390 | } | |
c017a39f | 391 | template<> inline Vc_CONST _M128D SortHelper<_M128D, 2>::sort(_M128D x) |
f22341db | 392 | { |
393 | const _M128D y = _mm_shuffle_pd(x, x, _MM_SHUFFLE2(0, 1)); | |
394 | return _mm_unpacklo_pd(_mm_min_sd(x, y), _mm_max_sd(x, y)); | |
395 | } | |
396 | ||
397 | // can be used to multiply with a constant. For some special constants it doesn't need an extra | |
398 | // vector but can use a shift instead, basically encoding the factor in the instruction. | |
c017a39f | 399 | template<typename IndexType, unsigned int constant> Vc_ALWAYS_INLINE Vc_CONST IndexType mulConst(const IndexType x) { |
f22341db | 400 | typedef VectorHelper<typename IndexType::EntryType> H; |
401 | switch (constant) { | |
402 | case 0: return H::zero(); | |
403 | case 1: return x; | |
404 | case 2: return H::slli(x.data(), 1); | |
405 | case 4: return H::slli(x.data(), 2); | |
406 | case 8: return H::slli(x.data(), 3); | |
407 | case 16: return H::slli(x.data(), 4); | |
408 | case 32: return H::slli(x.data(), 5); | |
409 | case 64: return H::slli(x.data(), 6); | |
410 | case 128: return H::slli(x.data(), 7); | |
411 | case 256: return H::slli(x.data(), 8); | |
412 | case 512: return H::slli(x.data(), 9); | |
413 | case 1024: return H::slli(x.data(), 10); | |
414 | case 2048: return H::slli(x.data(), 11); | |
415 | } | |
416 | #ifndef VC_IMPL_SSE4_1 | |
417 | // without SSE 4.1 int multiplication is not so nice | |
418 | if (sizeof(typename IndexType::EntryType) == 4) { | |
419 | switch (constant) { | |
420 | case 3: return H::add( x.data() , H::slli(x.data(), 1)); | |
421 | case 5: return H::add( x.data() , H::slli(x.data(), 2)); | |
422 | case 9: return H::add( x.data() , H::slli(x.data(), 3)); | |
423 | case 17: return H::add( x.data() , H::slli(x.data(), 4)); | |
424 | case 33: return H::add( x.data() , H::slli(x.data(), 5)); | |
425 | case 65: return H::add( x.data() , H::slli(x.data(), 6)); | |
426 | case 129: return H::add( x.data() , H::slli(x.data(), 7)); | |
427 | case 257: return H::add( x.data() , H::slli(x.data(), 8)); | |
428 | case 513: return H::add( x.data() , H::slli(x.data(), 9)); | |
429 | case 1025: return H::add( x.data() , H::slli(x.data(), 10)); | |
430 | case 2049: return H::add( x.data() , H::slli(x.data(), 11)); | |
431 | case 6: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 2)); | |
432 | case 10: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 3)); | |
433 | case 18: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 4)); | |
434 | case 34: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 5)); | |
435 | case 66: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 6)); | |
436 | case 130: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 7)); | |
437 | case 258: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 8)); | |
438 | case 514: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 9)); | |
439 | case 1026: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 10)); | |
440 | case 2050: return H::add(H::slli(x.data(), 1), H::slli(x.data(), 11)); | |
441 | case 12: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 3)); | |
442 | case 20: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 4)); | |
443 | case 36: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 5)); | |
444 | case 68: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 6)); | |
445 | case 132: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 7)); | |
446 | case 260: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 8)); | |
447 | case 516: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 9)); | |
448 | case 1028: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 10)); | |
449 | case 2052: return H::add(H::slli(x.data(), 2), H::slli(x.data(), 11)); | |
450 | case 24: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 4)); | |
451 | case 40: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 5)); | |
452 | case 72: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 6)); | |
453 | case 136: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 7)); | |
454 | case 264: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 8)); | |
455 | case 520: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 9)); | |
456 | case 1032: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 10)); | |
457 | case 2056: return H::add(H::slli(x.data(), 3), H::slli(x.data(), 11)); | |
458 | case 48: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 5)); | |
459 | case 80: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 6)); | |
460 | case 144: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 7)); | |
461 | case 272: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 8)); | |
462 | case 528: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 9)); | |
463 | case 1040: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 10)); | |
464 | case 2064: return H::add(H::slli(x.data(), 4), H::slli(x.data(), 11)); | |
465 | case 96: return H::add(H::slli(x.data(), 5), H::slli(x.data(), 6)); | |
466 | case 160: return H::add(H::slli(x.data(), 5), H::slli(x.data(), 7)); | |
467 | case 288: return H::add(H::slli(x.data(), 5), H::slli(x.data(), 8)); | |
468 | case 544: return H::add(H::slli(x.data(), 5), H::slli(x.data(), 9)); | |
469 | case 1056: return H::add(H::slli(x.data(), 5), H::slli(x.data(), 10)); | |
470 | case 2080: return H::add(H::slli(x.data(), 5), H::slli(x.data(), 11)); | |
471 | case 192: return H::add(H::slli(x.data(), 6), H::slli(x.data(), 7)); | |
472 | case 320: return H::add(H::slli(x.data(), 6), H::slli(x.data(), 8)); | |
473 | case 576: return H::add(H::slli(x.data(), 6), H::slli(x.data(), 9)); | |
474 | case 1088: return H::add(H::slli(x.data(), 6), H::slli(x.data(), 10)); | |
475 | case 2112: return H::add(H::slli(x.data(), 6), H::slli(x.data(), 11)); | |
476 | case 384: return H::add(H::slli(x.data(), 7), H::slli(x.data(), 8)); | |
477 | case 640: return H::add(H::slli(x.data(), 7), H::slli(x.data(), 9)); | |
478 | case 1152: return H::add(H::slli(x.data(), 7), H::slli(x.data(), 10)); | |
479 | case 2176: return H::add(H::slli(x.data(), 7), H::slli(x.data(), 11)); | |
480 | case 768: return H::add(H::slli(x.data(), 8), H::slli(x.data(), 9)); | |
481 | case 1280: return H::add(H::slli(x.data(), 8), H::slli(x.data(), 10)); | |
482 | case 2304: return H::add(H::slli(x.data(), 8), H::slli(x.data(), 11)); | |
483 | case 1536: return H::add(H::slli(x.data(), 9), H::slli(x.data(), 10)); | |
484 | case 2560: return H::add(H::slli(x.data(), 9), H::slli(x.data(), 11)); | |
485 | case 3072: return H::add(H::slli(x.data(),10), H::slli(x.data(), 11)); | |
486 | } | |
487 | } | |
488 | #endif | |
489 | return H::mul(x.data(), H::set(constant)); | |
490 | } | |
491 | } // namespace SSE | |
492 | } // namespace Vc | |
c017a39f | 493 | } // namespace AliRoot |