Vc/include/Vc/avx/vectorhelper.h

   1 /*  This file is part of the Vc library.
   2
   3     Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>
   4
   5     Vc is free software: you can redistribute it and/or modify
   6     it under the terms of the GNU Lesser General Public License as
   7     published by the Free Software Foundation, either version 3 of
   8     the License, or (at your option) any later version.
   9
  10     Vc is distributed in the hope that it will be useful, but
  11     WITHOUT ANY WARRANTY; without even the implied warranty of
  12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13     GNU Lesser General Public License for more details.
  14
  15     You should have received a copy of the GNU Lesser General Public
  16     License along with Vc.  If not, see <http://www.gnu.org/licenses/>.
  17
  18 */
  19
  20 #ifndef AVX_VECTORHELPER_H
  21 #define AVX_VECTORHELPER_H
  22
  23 #include <limits>
  24 #include "types.h"
  25 #include "intrinsics.h"
  26 #include "casts.h"
  27 #include "macros.h"
  28
  29 namespace AliRoot {
  30 namespace Vc
  31 {
  32 namespace AVX
  33 {
  34
  35 namespace Internal
  36 {
  37 Vc_INTRINSIC Vc_CONST m256 exponent(param256 v)
  38 {
  39     m128i tmp0 = _mm_srli_epi32(avx_cast<m128i>(v), 23);
  40     m128i tmp1 = _mm_srli_epi32(avx_cast<m128i>(hi128(v)), 23);
  41     tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f));
  42     tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f));
  43     return _mm256_cvtepi32_ps(concat(tmp0, tmp1));
  44 }
  45 Vc_INTRINSIC Vc_CONST m256d exponent(param256d v)
  46 {
  47     m128i tmp0 = _mm_srli_epi64(avx_cast<m128i>(v), 52);
  48     m128i tmp1 = _mm_srli_epi64(avx_cast<m128i>(hi128(v)), 52);
  49     tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff));
  50     tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff));
  51     return _mm256_cvtepi32_pd(avx_cast<m128i>(Mem::shuffle<X0, X2, Y0, Y2>(avx_cast<m128>(tmp0), avx_cast<m128>(tmp1))));
  52 }
  53 } // namespace Internal
  54
  55 #define OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; }
  56 #define OP1(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a) { return code; }
  57 #define OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a, VTArg b) { return code; }
  58 #define OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(VTArg a, VTArg b, VTArg c) { return code; }
  59
  60         template<> struct VectorHelper<m256>
  61         {
  62             typedef m256 VectorType;
  63 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
  64             typedef const VectorType & VTArg;
  65 #else
  66             typedef const VectorType VTArg;
  67 #endif
  68             template<typename A> static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const float *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R;
  69             static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, AlignedFlag) Vc_ALWAYS_INLINE_R;
  70             static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, UnalignedFlag) Vc_ALWAYS_INLINE_R;
  71             static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
  72             static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
  73             static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, AlignedFlag) Vc_ALWAYS_INLINE_R;
  74             static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, UnalignedFlag) Vc_ALWAYS_INLINE_R;
  75             static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
  76             static Vc_ALWAYS_INLINE_L void store(float *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
  77
  78             static Vc_ALWAYS_INLINE Vc_CONST VectorType cdab(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 3, 0, 1)); }
  79             static Vc_ALWAYS_INLINE Vc_CONST VectorType badc(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 0, 3, 2)); }
  80             static Vc_ALWAYS_INLINE Vc_CONST VectorType aaaa(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(0, 0, 0, 0)); }
  81             static Vc_ALWAYS_INLINE Vc_CONST VectorType bbbb(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1)); }
  82             static Vc_ALWAYS_INLINE Vc_CONST VectorType cccc(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2)); }
  83             static Vc_ALWAYS_INLINE Vc_CONST VectorType dddd(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3)); }
  84             static Vc_ALWAYS_INLINE Vc_CONST VectorType dacb(VTArg x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 0, 2, 1)); }
  85
  86             OP0(allone, _mm256_setallone_ps())
  87             OP0(zero, _mm256_setzero_ps())
  88             OP2(or_, _mm256_or_ps(a, b))
  89             OP2(xor_, _mm256_xor_ps(a, b))
  90             OP2(and_, _mm256_and_ps(a, b))
  91             OP2(andnot_, _mm256_andnot_ps(a, b))
  92             OP3(blend, _mm256_blendv_ps(a, b, c))
  93         };
  94
  95         template<> struct VectorHelper<m256d>
  96         {
  97             typedef m256d VectorType;
  98 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
  99             typedef const VectorType & VTArg;
 100 #else
 101             typedef const VectorType VTArg;
 102 #endif
 103             template<typename A> static Vc_ALWAYS_INLINE_L Vc_PURE_L VectorType load(const double *x, A) Vc_ALWAYS_INLINE_R Vc_PURE_R;
 104             static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, AlignedFlag) Vc_ALWAYS_INLINE_R;
 105             static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, UnalignedFlag) Vc_ALWAYS_INLINE_R;
 106             static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
 107             static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
 108             static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, AlignedFlag) Vc_ALWAYS_INLINE_R;
 109             static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, UnalignedFlag) Vc_ALWAYS_INLINE_R;
 110             static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, StreamingAndAlignedFlag) Vc_ALWAYS_INLINE_R;
 111             static Vc_ALWAYS_INLINE_L void store(double *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag) Vc_ALWAYS_INLINE_R;
 112
 113             static VectorType cdab(VTArg x) { return _mm256_permute_pd(x, 5); }
 114             static VectorType badc(VTArg x) { return _mm256_permute2f128_pd(x, x, 1); }
 115             // aaaa bbbb cccc dddd specialized in vector.tcc
 116             static VectorType dacb(VTArg x) {
 117                 const m128d cb = avx_cast<m128d>(_mm_alignr_epi8(avx_cast<m128i>(lo128(x)),
 118                             avx_cast<m128i>(hi128(x)), sizeof(double))); // XXX: lo and hi swapped?
 119                 const m128d da = _mm_blend_pd(lo128(x), hi128(x), 0 + 2); // XXX: lo and hi swapped?
 120                 return concat(cb, da);
 121             }
 122
 123             OP0(allone, _mm256_setallone_pd())
 124             OP0(zero, _mm256_setzero_pd())
 125             OP2(or_, _mm256_or_pd(a, b))
 126             OP2(xor_, _mm256_xor_pd(a, b))
 127             OP2(and_, _mm256_and_pd(a, b))
 128             OP2(andnot_, _mm256_andnot_pd(a, b))
 129             OP3(blend, _mm256_blendv_pd(a, b, c))
 130         };
 131
 132         template<> struct VectorHelper<m256i>
 133         {
 134             typedef m256i VectorType;
 135 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
 136             typedef const VectorType & VTArg;
 137 #else
 138             typedef const VectorType VTArg;
 139 #endif
 140             template<typename T> static VectorType load(const T *x, AlignedFlag) Vc_PURE;
 141             template<typename T> static VectorType load(const T *x, UnalignedFlag) Vc_PURE;
 142             template<typename T> static VectorType load(const T *x, StreamingAndAlignedFlag) Vc_PURE;
 143             template<typename T> static VectorType load(const T *x, StreamingAndUnalignedFlag) Vc_PURE;
 144             template<typename T> static void store(T *mem, VTArg x, AlignedFlag);
 145             template<typename T> static void store(T *mem, VTArg x, UnalignedFlag);
 146             template<typename T> static void store(T *mem, VTArg x, StreamingAndAlignedFlag);
 147             template<typename T> static void store(T *mem, VTArg x, StreamingAndUnalignedFlag);
 148             template<typename T> static void store(T *mem, VTArg x, VTArg m, AlignedFlag);
 149             template<typename T> static void store(T *mem, VTArg x, VTArg m, UnalignedFlag);
 150             template<typename T> static void store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag);
 151             template<typename T> static void store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag);
 152
 153             static VectorType cdab(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(2, 3, 0, 1))); }
 154             static VectorType badc(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(1, 0, 3, 2))); }
 155             static VectorType aaaa(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(0, 0, 0, 0))); }
 156             static VectorType bbbb(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(1, 1, 1, 1))); }
 157             static VectorType cccc(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(2, 2, 2, 2))); }
 158             static VectorType dddd(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(3, 3, 3, 3))); }
 159             static VectorType dacb(VTArg x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<m256>(x), _MM_SHUFFLE(3, 0, 2, 1))); }
 160
 161             OP0(allone, _mm256_setallone_si256())
 162             OP0(zero, _mm256_setzero_si256())
 163             OP2(or_, _mm256_or_si256(a, b))
 164             OP2(xor_, _mm256_xor_si256(a, b))
 165             OP2(and_, _mm256_and_si256(a, b))
 166             OP2(andnot_, _mm256_andnot_si256(a, b))
 167             OP3(blend, _mm256_blendv_epi8(a, b, c))
 168         };
 169
 170         template<> struct VectorHelper<m128i>
 171         {
 172             typedef m128i VectorType;
 173 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
 174             typedef const VectorType & VTArg;
 175 #else
 176             typedef const VectorType VTArg;
 177 #endif
 178             template<typename T> static VectorType load(const T *x, AlignedFlag) Vc_PURE;
 179             template<typename T> static VectorType load(const T *x, UnalignedFlag) Vc_PURE;
 180             template<typename T> static VectorType load(const T *x, StreamingAndAlignedFlag) Vc_PURE;
 181             template<typename T> static VectorType load(const T *x, StreamingAndUnalignedFlag) Vc_PURE;
 182             template<typename T> static void store(T *mem, VTArg x, AlignedFlag);
 183             template<typename T> static void store(T *mem, VTArg x, UnalignedFlag);
 184             template<typename T> static void store(T *mem, VTArg x, StreamingAndAlignedFlag);
 185             template<typename T> static void store(T *mem, VTArg x, StreamingAndUnalignedFlag);
 186             template<typename T> static void store(T *mem, VTArg x, VTArg m, AlignedFlag);
 187             template<typename T> static void store(T *mem, VTArg x, VTArg m, UnalignedFlag);
 188             template<typename T> static void store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag);
 189             template<typename T> static void store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag);
 190
 191             static VectorType cdab(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)); }
 192             static VectorType badc(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)); }
 193             static VectorType aaaa(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 0, 0, 0)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(0, 0, 0, 0)); }
 194             static VectorType bbbb(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 1, 1, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(1, 1, 1, 1)); }
 195             static VectorType cccc(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 2, 2, 2)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(2, 2, 2, 2)); }
 196             static VectorType dddd(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 3, 3, 3)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(3, 3, 3, 3)); }
 197             static VectorType dacb(VTArg x) { const __m128i tmp = _mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 0, 2, 1)); return _mm_shufflehi_epi16(tmp, _MM_SHUFFLE(3, 0, 2, 1)); }
 198
 199             OP0(allone, _mm_setallone_si128())
 200             OP0(zero, _mm_setzero_si128())
 201             OP2(or_, _mm_or_si128(a, b))
 202             OP2(xor_, _mm_xor_si128(a, b))
 203             OP2(and_, _mm_and_si128(a, b))
 204             OP2(andnot_, _mm_andnot_si128(a, b))
 205             OP3(blend, _mm_blendv_epi8(a, b, c))
 206         };
 207 #undef OP1
 208 #undef OP2
 209 #undef OP3
 210
 211 #define OP1(op) \
 212         static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a) { return CAT(_mm256_##op##_, SUFFIX)(a); }
 213 #define OP(op) \
 214         static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op##_ , SUFFIX)(a, b); }
 215 #define OP_(op) \
 216         static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op    , SUFFIX)(a, b); }
 217 #define OPx(op, op2) \
 218         static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_##op2##_, SUFFIX)(a, b); }
 219 #define OPcmp(op) \
 220         static Vc_INTRINSIC VectorType Vc_CONST cmp##op(VTArg a, VTArg b) { return CAT(_mm256_cmp##op##_, SUFFIX)(a, b); }
 221 #define OP_CAST_(op) \
 222         static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return CAT(_mm256_castps_, SUFFIX)( \
 223             _mm256_##op##ps(CAT(CAT(_mm256_cast, SUFFIX), _ps)(a), \
 224               CAT(CAT(_mm256_cast, SUFFIX), _ps)(b))); \
 225         }
 226 #define MINMAX \
 227         static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return CAT(_mm256_min_, SUFFIX)(a, b); } \
 228         static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return CAT(_mm256_max_, SUFFIX)(a, b); }
 229
 230         template<> struct VectorHelper<double> {
 231             typedef m256d VectorType;
 232 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
 233             typedef const VectorType & VTArg;
 234 #else
 235             typedef const VectorType VTArg;
 236 #endif
 237             typedef double EntryType;
 238             typedef double ConcatType;
 239 #define SUFFIX pd
 240
 241             static Vc_ALWAYS_INLINE VectorType notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_pd(mask), a); }
 242             static Vc_ALWAYS_INLINE VectorType set(const double a) { return CAT(_mm256_set1_, SUFFIX)(a); }
 243             static Vc_ALWAYS_INLINE VectorType set(const double a, const double b, const double c, const double d) {
 244                 return CAT(_mm256_set_, SUFFIX)(a, b, c, d);
 245             }
 246             static Vc_ALWAYS_INLINE VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
 247             static Vc_ALWAYS_INLINE VectorType one()  { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.); }
 248
 249             static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
 250 #ifdef VC_IMPL_FMA4
 251                 v1 = _mm256_macc_pd(v1, v2, v3);
 252 #else
 253                 VectorType h1 = _mm256_and_pd(v1, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
 254                 VectorType h2 = _mm256_and_pd(v2, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
 255 #if defined(VC_GCC) && VC_GCC < 0x40703
 256                 // GCC before 4.7.3 uses an incorrect optimization where it replaces the subtraction with an andnot
 257                 // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703
 258                 asm("":"+x"(h1), "+x"(h2));
 259 #endif
 260                 const VectorType l1 = _mm256_sub_pd(v1, h1);
 261                 const VectorType l2 = _mm256_sub_pd(v2, h2);
 262                 const VectorType ll = mul(l1, l2);
 263                 const VectorType lh = add(mul(l1, h2), mul(h1, l2));
 264                 const VectorType hh = mul(h1, h2);
 265                 // ll < lh < hh for all entries is certain
 266                 const VectorType lh_lt_v3 = cmplt(abs(lh), abs(v3)); // |lh| < |v3|
 267                 const VectorType b = _mm256_blendv_pd(v3, lh, lh_lt_v3);
 268                 const VectorType c = _mm256_blendv_pd(lh, v3, lh_lt_v3);
 269                 v1 = add(add(ll, b), add(c, hh));
 270 #endif
 271             }
 272
 273             OP(add) OP(sub) OP(mul)
 274             OPcmp(eq) OPcmp(neq)
 275             OPcmp(lt) OPcmp(nlt)
 276             OPcmp(le) OPcmp(nle)
 277
 278             OP1(sqrt)
 279             static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VTArg x) {
 280                 return _mm256_div_pd(one(), sqrt(x));
 281             }
 282             static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
 283                 return _mm256_div_pd(one(), x);
 284             }
 285             static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VTArg x) {
 286                 return _mm256_cmpunord_pd(x, x);
 287             }
 288             static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VTArg x) {
 289                 return _mm256_cmpord_pd(x, _mm256_mul_pd(zero(), x));
 290             }
 291             static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
 292                 return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_pd());
 293             }
 294
 295             MINMAX
 296             static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
 297                 m128d b = _mm_min_pd(avx_cast<m128d>(a), _mm256_extractf128_pd(a, 1));
 298                 b = _mm_min_sd(b, _mm_unpackhi_pd(b, b));
 299                 return _mm_cvtsd_f64(b);
 300             }
 301             static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
 302                 m128d b = _mm_max_pd(avx_cast<m128d>(a), _mm256_extractf128_pd(a, 1));
 303                 b = _mm_max_sd(b, _mm_unpackhi_pd(b, b));
 304                 return _mm_cvtsd_f64(b);
 305             }
 306             static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
 307                 m128d b = _mm_mul_pd(avx_cast<m128d>(a), _mm256_extractf128_pd(a, 1));
 308                 b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
 309                 return _mm_cvtsd_f64(b);
 310             }
 311             static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
 312                 m128d b = _mm_add_pd(avx_cast<m128d>(a), _mm256_extractf128_pd(a, 1));
 313                 b = _mm_hadd_pd(b, b); // or: b = _mm_add_sd(b, _mm256_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
 314                 return _mm_cvtsd_f64(b);
 315             }
 316 #undef SUFFIX
 317             static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
 318                 return _mm256_round_pd(a, _MM_FROUND_NINT);
 319             }
 320         };
 321
 322         template<> struct VectorHelper<float> {
 323             typedef float EntryType;
 324             typedef m256 VectorType;
 325 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
 326             typedef const VectorType & VTArg;
 327 #else
 328             typedef const VectorType VTArg;
 329 #endif
 330             typedef double ConcatType;
 331 #define SUFFIX ps
 332
 333             static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(mask, a); }
 334             static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return CAT(_mm256_set1_, SUFFIX)(a); }
 335             static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d,
 336                     const float e, const float f, const float g, const float h) {
 337                 return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }
 338             static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
 339             static Vc_ALWAYS_INLINE Vc_CONST VectorType one()  { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.f); }
 340             static Vc_ALWAYS_INLINE Vc_CONST m256 concat(param256d a, param256d b) { return _mm256_insertf128_ps(avx_cast<m256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); }
 341
 342             static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
 343 #ifdef VC_IMPL_FMA4
 344                 v1 = _mm256_macc_ps(v1, v2, v3);
 345 #else
 346                 m256d v1_0 = _mm256_cvtps_pd(lo128(v1));
 347                 m256d v1_1 = _mm256_cvtps_pd(hi128(v1));
 348                 m256d v2_0 = _mm256_cvtps_pd(lo128(v2));
 349                 m256d v2_1 = _mm256_cvtps_pd(hi128(v2));
 350                 m256d v3_0 = _mm256_cvtps_pd(lo128(v3));
 351                 m256d v3_1 = _mm256_cvtps_pd(hi128(v3));
 352                 v1 = AVX::concat(
 353                         _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)),
 354                         _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1)));
 355 #endif
 356             }
 357
 358             OP(add) OP(sub) OP(mul)
 359             OPcmp(eq) OPcmp(neq)
 360             OPcmp(lt) OPcmp(nlt)
 361             OPcmp(le) OPcmp(nle)
 362
 363             OP1(sqrt) OP1(rsqrt)
 364             static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VTArg x) {
 365                 return _mm256_cmpunord_ps(x, x);
 366             }
 367             static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VTArg x) {
 368                 return _mm256_cmpord_ps(x, _mm256_mul_ps(zero(), x));
 369             }
 370             static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
 371                 return _mm256_rcp_ps(x);
 372             }
 373             static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
 374                 return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_ps());
 375             }
 376
 377             MINMAX
 378             static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
 379                 m128 b = _mm_min_ps(avx_cast<m128>(a), _mm256_extractf128_ps(a, 1));
 380                 b = _mm_min_ps(b, _mm_movehl_ps(b, b));   // b = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)
 381                 b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = min(a0, a1), a1, a2, a3
 382                 return _mm_cvtss_f32(b);
 383             }
 384             static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
 385                 m128 b = _mm_max_ps(avx_cast<m128>(a), _mm256_extractf128_ps(a, 1));
 386                 b = _mm_max_ps(b, _mm_movehl_ps(b, b));   // b = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)
 387                 b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = max(a0, a1), a1, a2, a3
 388                 return _mm_cvtss_f32(b);
 389             }
 390             static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
 391                 m128 b = _mm_mul_ps(avx_cast<m128>(a), _mm256_extractf128_ps(a, 1));
 392                 b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
 393                 b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
 394                 return _mm_cvtss_f32(b);
 395             }
 396             static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
 397                 m128 b = _mm_add_ps(avx_cast<m128>(a), _mm256_extractf128_ps(a, 1));
 398                 b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
 399                 b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
 400                 return _mm_cvtss_f32(b);
 401             }
 402 #undef SUFFIX
 403             static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
 404                 return _mm256_round_ps(a, _MM_FROUND_NINT);
 405             }
 406         };
 407
 408         template<> struct VectorHelper<sfloat> : public VectorHelper<float> {};
 409
 410         template<> struct VectorHelper<int> {
 411             typedef int EntryType;
 412             typedef m256i VectorType;
 413 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
 414             typedef const VectorType & VTArg;
 415 #else
 416             typedef const VectorType VTArg;
 417 #endif
 418             typedef long long ConcatType;
 419 #define SUFFIX si256
 420
 421             OP_(or_) OP_(and_) OP_(xor_)
 422             static Vc_INTRINSIC VectorType Vc_CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
 423             static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); }
 424 #undef SUFFIX
 425 #define SUFFIX epi32
 426             static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm256_setone_, SUFFIX)(); }
 427
 428             static Vc_INTRINSIC VectorType Vc_CONST set(const int a) { return CAT(_mm256_set1_, SUFFIX)(a); }
 429             static Vc_INTRINSIC VectorType Vc_CONST set(const int a, const int b, const int c, const int d,
 430                     const int e, const int f, const int g, const int h) {
 431                 return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }
 432
 433             static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); }
 434
 435             static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) {
 436                 return CAT(_mm256_slli_, SUFFIX)(a, shift);
 437             }
 438             static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) {
 439                 return CAT(_mm256_srai_, SUFFIX)(a, shift);
 440             }
 441             OP1(abs)
 442
 443             MINMAX
 444             static Vc_INTRINSIC EntryType Vc_CONST min(VTArg a) {
 445                 m128i b = _mm_min_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
 446                 b = _mm_min_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
 447                 b = _mm_min_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
 448                 return _mm_cvtsi128_si32(b);
 449             }
 450             static Vc_INTRINSIC EntryType Vc_CONST max(VTArg a) {
 451                 m128i b = _mm_max_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
 452                 b = _mm_max_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
 453                 b = _mm_max_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
 454                 return _mm_cvtsi128_si32(b);
 455             }
 456             static Vc_INTRINSIC EntryType Vc_CONST add(VTArg a) {
 457                 m128i b = _mm_add_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
 458                 b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
 459                 b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
 460                 return _mm_cvtsi128_si32(b);
 461             }
 462             static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg a) {
 463                 m128i b = _mm_mullo_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
 464                 b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
 465                 b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
 466                 return _mm_cvtsi128_si32(b);
 467             }
 468
 469             static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mullo_epi32(a, b); }
 470
 471             OP(add) OP(sub)
 472             OPcmp(eq)
 473             OPcmp(lt)
 474             OPcmp(gt)
 475             static Vc_INTRINSIC VectorType Vc_CONST cmpneq(VTArg a, VTArg b) { m256i x = cmpeq(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
 476             static Vc_INTRINSIC VectorType Vc_CONST cmpnlt(VTArg a, VTArg b) { m256i x = cmplt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
 477             static Vc_INTRINSIC VectorType Vc_CONST cmple (VTArg a, VTArg b) { m256i x = cmpgt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
 478             static Vc_INTRINSIC VectorType Vc_CONST cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); }
 479 #undef SUFFIX
 480             static Vc_INTRINSIC VectorType Vc_CONST round(VTArg a) { return a; }
 481         };
 482
 483         template<> struct VectorHelper<unsigned int> {
 484             typedef unsigned int EntryType;
 485             typedef m256i VectorType;
 486 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
 487             typedef const VectorType & VTArg;
 488 #else
 489             typedef const VectorType VTArg;
 490 #endif
 491             typedef unsigned long long ConcatType;
 492 #define SUFFIX si256
 493             OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_)
 494             static Vc_INTRINSIC VectorType Vc_CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
 495             static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); }
 496
 497 #undef SUFFIX
 498 #define SUFFIX epu32
 499             static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm256_setone_, SUFFIX)(); }
 500
 501             MINMAX
 502             static Vc_INTRINSIC EntryType Vc_CONST min(VTArg a) {
 503                 m128i b = _mm_min_epu32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
 504                 b = _mm_min_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
 505                 b = _mm_min_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
 506                 return _mm_cvtsi128_si32(b);
 507             }
 508             static Vc_INTRINSIC EntryType Vc_CONST max(VTArg a) {
 509                 m128i b = _mm_max_epu32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
 510                 b = _mm_max_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
 511                 b = _mm_max_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
 512                 return _mm_cvtsi128_si32(b);
 513             }
 514             static Vc_INTRINSIC EntryType Vc_CONST add(VTArg a) {
 515                 m128i b = _mm_add_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
 516                 b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
 517                 b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
 518                 return _mm_cvtsi128_si32(b);
 519             }
 520             static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg a) {
 521                 m128i b = _mm_mullo_epi32(avx_cast<m128i>(a), _mm256_extractf128_si256(a, 1));
 522                 b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
 523                 b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
 524                 return _mm_cvtsi128_si32(b);
 525             }
 526
 527             static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mullo_epi32(a, b); }
 528             static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); }
 529
 530 #undef SUFFIX
 531 #define SUFFIX epi32
 532             static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) {
 533                 return CAT(_mm256_slli_, SUFFIX)(a, shift);
 534             }
 535             static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) {
 536                 return CAT(_mm256_srli_, SUFFIX)(a, shift);
 537             }
 538             static Vc_INTRINSIC VectorType Vc_CONST set(const unsigned int a) { return CAT(_mm256_set1_, SUFFIX)(a); }
 539             static Vc_INTRINSIC VectorType Vc_CONST set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d,
 540                     const unsigned int e, const unsigned int f, const unsigned int g, const unsigned int h) {
 541                 return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }
 542
            // NOTE(review): OP and OPcmp are macros defined outside this part of the file.
            // Presumably they generate the element-wise two-argument add()/sub() and
            // cmpeq() for the current SUFFIX (epi32 here); fma() and cmpneq() in this
            // struct call add(x, y) and cmpeq(a, b) -- confirm against the definitions.
            OP(add) OP(sub)
            OPcmp(eq)
 545             static Vc_INTRINSIC VectorType Vc_CONST cmpneq(VTArg a, VTArg b) { return _mm256_andnot_si256(cmpeq(a, b), _mm256_setallone_si256()); }
 546
 547 #ifndef USE_INCORRECT_UNSIGNED_COMPARE
 548             static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) {
 549                 return _mm256_cmplt_epu32(a, b);
 550             }
 551             static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) {
 552                 return _mm256_cmpgt_epu32(a, b);
 553             }
 554 #else
 555             OPcmp(lt)
 556             OPcmp(gt)
 557 #endif
 558             static Vc_INTRINSIC VectorType Vc_CONST cmpnlt(VTArg a, VTArg b) { return _mm256_andnot_si256(cmplt(a, b), _mm256_setallone_si256()); }
 559             static Vc_INTRINSIC VectorType Vc_CONST cmple (VTArg a, VTArg b) { return _mm256_andnot_si256(cmpgt(a, b), _mm256_setallone_si256()); }
 560             static Vc_INTRINSIC VectorType Vc_CONST cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); }
 561
 562 #undef SUFFIX
 563             static Vc_INTRINSIC VectorType Vc_CONST round(VTArg a) { return a; }
 564         };
 565
 566         template<> struct VectorHelper<signed short> {
 567             typedef VectorTypeHelper<signed short>::Type VectorType;
 568 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
 569             typedef const VectorType & VTArg;
 570 #else
 571             typedef const VectorType VTArg;
 572 #endif
 573             typedef signed short EntryType;
 574             typedef int ConcatType;
 575
 576             static Vc_INTRINSIC VectorType Vc_CONST or_(VTArg a, VTArg b) { return _mm_or_si128(a, b); }
 577             static Vc_INTRINSIC VectorType Vc_CONST and_(VTArg a, VTArg b) { return _mm_and_si128(a, b); }
 578             static Vc_INTRINSIC VectorType Vc_CONST xor_(VTArg a, VTArg b) { return _mm_xor_si128(a, b); }
 579             static Vc_INTRINSIC VectorType Vc_CONST zero() { return _mm_setzero_si128(); }
 580             static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); }
 581
 582 #define SUFFIX epi16
 583             static Vc_INTRINSIC VectorType Vc_CONST one() { return CAT(_mm_setone_, SUFFIX)(); }
 584
 585             static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) {
 586                 return CAT(_mm_slli_, SUFFIX)(a, shift);
 587             }
 588             static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) {
 589                 return CAT(_mm_srai_, SUFFIX)(a, shift);
 590             }
 591             static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }
 592             static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
 593                     const EntryType e, const EntryType f, const EntryType g, const EntryType h) {
 594                 return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
 595             }
 596
 597             static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) {
 598                 v1 = add(mul(v1, v2), v3);
 599             }
 600
 601             static Vc_INTRINSIC VectorType Vc_CONST abs(VTArg a) { return _mm_abs_epi16(a); }
 602             static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm_mullo_epi16(a, b); }
 603             static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm_min_epi16(a, b); }
 604             static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm_max_epi16(a, b); }
 605
 606             static Vc_INTRINSIC EntryType Vc_CONST min(VTArg _a) {
 607                 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
 608                 VectorType a = min(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
 609                 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 610                 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 611                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 612             }
 613             static Vc_INTRINSIC EntryType Vc_CONST max(VTArg _a) {
 614                 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
 615                 VectorType a = max(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
 616                 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 617                 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 618                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 619             }
 620             static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg _a) {
 621                 VectorType a = mul(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
 622                 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 623                 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 624                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 625             }
 626             static Vc_INTRINSIC EntryType Vc_CONST add(VTArg _a) {
 627                 VectorType a = add(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
 628                 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 629                 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 630                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 631             }
 632
 633             static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm_add_epi16(a, b); }
 634             static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm_sub_epi16(a, b); }
 635             static Vc_INTRINSIC VectorType Vc_CONST cmpeq(VTArg a, VTArg b) { return _mm_cmpeq_epi16(a, b); }
 636             static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epi16(a, b); }
 637             static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epi16(a, b); }
 638             static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(VTArg a, VTArg b) { m128i x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
 639             static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(VTArg a, VTArg b) { m128i x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
 640             static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (VTArg a, VTArg b) { m128i x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
 641             static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); }
 642 #undef SUFFIX
 643             static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return a; }
 644         };
 645
 646         template<> struct VectorHelper<unsigned short> {
 647             typedef VectorTypeHelper<unsigned short>::Type VectorType;
 648 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
 649             typedef const VectorType & VTArg;
 650 #else
 651             typedef const VectorType VTArg;
 652 #endif
 653             typedef unsigned short EntryType;
 654             typedef unsigned int ConcatType;
 655
 656             static Vc_INTRINSIC VectorType Vc_CONST or_(VTArg a, VTArg b) { return _mm_or_si128(a, b); }
 657             static Vc_INTRINSIC VectorType Vc_CONST and_(VTArg a, VTArg b) { return _mm_and_si128(a, b); }
 658             static Vc_INTRINSIC VectorType Vc_CONST xor_(VTArg a, VTArg b) { return _mm_xor_si128(a, b); }
 659             static Vc_INTRINSIC VectorType Vc_CONST zero() { return _mm_setzero_si128(); }
 660             static Vc_INTRINSIC VectorType Vc_CONST notMaskedToZero(VTArg a, param128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); }
 661             static Vc_INTRINSIC VectorType Vc_CONST one() { return _mm_setone_epu16(); }
 662
 663             static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm_mullo_epi16(a, b); }
 664             static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm_min_epu16(a, b); }
 665             static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm_max_epu16(a, b); }
 666
 667 #define SUFFIX epi16
 668             static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VTArg a, int shift) {
 669                 return CAT(_mm_slli_, SUFFIX)(a, shift);
 670             }
 671             static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VTArg a, int shift) {
 672                 return CAT(_mm_srli_, SUFFIX)(a, shift);
 673             }
 674             static Vc_INTRINSIC EntryType Vc_CONST min(VTArg _a) {
 675                 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
 676                 VectorType a = min(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
 677                 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 678                 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 679                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 680             }
 681             static Vc_INTRINSIC EntryType Vc_CONST max(VTArg _a) {
 682                 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
 683                 VectorType a = max(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
 684                 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 685                 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 686                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 687             }
 688             static Vc_INTRINSIC EntryType Vc_CONST mul(VTArg _a) {
 689                 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
 690                 VectorType a = mul(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
 691                 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 692                 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 693                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 694             }
 695             static Vc_INTRINSIC EntryType Vc_CONST add(VTArg _a) {
 696                 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
 697                 VectorType a = add(_a, _mm_shuffle_epi32(_a, _MM_SHUFFLE(1, 0, 3, 2)));
 698                 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 699                 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 700                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 701             }
 702             static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }
 703             static Vc_INTRINSIC VectorType Vc_CONST set(const EntryType a, const EntryType b, const EntryType c,
 704                     const EntryType d, const EntryType e, const EntryType f,
 705                     const EntryType g, const EntryType h) {
 706                 return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
 707             }
 708             static Vc_INTRINSIC void fma(VectorType &v1, VTArg v2, VTArg v3) { v1 = add(mul(v1, v2), v3); }
 709
 710             static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm_add_epi16(a, b); }
 711             static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm_sub_epi16(a, b); }
 712             static Vc_INTRINSIC VectorType Vc_CONST cmpeq(VTArg a, VTArg b) { return _mm_cmpeq_epi16(a, b); }
 713             static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpneq(VTArg a, VTArg b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); }
 714
 715 #ifndef USE_INCORRECT_UNSIGNED_COMPARE
 716             static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epu16(a, b); }
 717             static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epu16(a, b); }
 718 #else
 719             static Vc_INTRINSIC VectorType Vc_CONST cmplt(VTArg a, VTArg b) { return _mm_cmplt_epi16(a, b); }
 720             static Vc_INTRINSIC VectorType Vc_CONST cmpgt(VTArg a, VTArg b) { return _mm_cmpgt_epi16(a, b); }
 721 #endif
 722             static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnlt(VTArg a, VTArg b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); }
 723             static Vc_ALWAYS_INLINE Vc_CONST VectorType cmple (VTArg a, VTArg b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); }
 724             static Vc_ALWAYS_INLINE Vc_CONST VectorType cmpnle(VTArg a, VTArg b) { return cmpgt(a, b); }
 725 #undef SUFFIX
 726             static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) { return a; }
 727         };
 728 #undef OP1
 729 #undef OP
 730 #undef OP_
 731 #undef OPx
 732 #undef OPcmp
 733
 734 template<> struct VectorHelper<char>
 735 {
 736     typedef VectorTypeHelper<char>::Type VectorType;
 737 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
 738     typedef const VectorType & VTArg;
 739 #else
 740     typedef const VectorType VTArg;
 741 #endif
 742     typedef char EntryType;
 743     typedef short ConcatType;
 744 };
 745
 746 template<> struct VectorHelper<unsigned char>
 747 {
 748     typedef VectorTypeHelper<unsigned char>::Type VectorType;
 749 #ifdef VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN
 750     typedef const VectorType & VTArg;
 751 #else
 752     typedef const VectorType VTArg;
 753 #endif
 754     typedef unsigned char EntryType;
 755     typedef unsigned short ConcatType;
 756 };
 757
 758 } // namespace AVX
 759 } // namespace Vc
 760 } // namespace AliRoot
 761
 762 #include "vectorhelper.tcc"
 763 #include "undomacros.h"
 764
 765 #endif // AVX_VECTORHELPER_H