Vc/include/Vc/avx/vectorhelper.h

   1 /*  This file is part of the Vc library.
   2
   3     Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>
   4
   5     Vc is free software: you can redistribute it and/or modify
   6     it under the terms of the GNU Lesser General Public License as
   7     published by the Free Software Foundation, either version 3 of
   8     the License, or (at your option) any later version.
   9
  10     Vc is distributed in the hope that it will be useful, but
  11     WITHOUT ANY WARRANTY; without even the implied warranty of
  12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13     GNU Lesser General Public License for more details.
  14
  15     You should have received a copy of the GNU Lesser General Public
  16     License along with Vc.  If not, see <http://www.gnu.org/licenses/>.
  17
  18 */
  19
  20 #ifndef AVX_VECTORHELPER_H
  21 #define AVX_VECTORHELPER_H
  22
  23 #include <limits>
  24 #include "types.h"
  25 #include "intrinsics.h"
  26 #include "casts.h"
  27
  28 namespace Vc
  29 {
  30 namespace AVX
  31 {
  32 #define OP0(name, code) static inline VectorType name() { return code; } // nullary static member `name()` returning `code`
  33 #define OP1(name, code) static inline VectorType name(const VectorType &a) { return code; } // unary member; `code` may refer to a
  34 #define OP2(name, code) static inline VectorType name(const VectorType &a, const VectorType &b) { return code; } // binary member; `code` may refer to a and b
  35 #define OP3(name, code) static inline VectorType name(const VectorType &a, const VectorType &b, const VectorType &c) { return code; } // ternary member; `code` may refer to a, b and c
  36
  37         template<> struct VectorHelper<_M256>
  38         {
                 // Low-level helpers that work directly on the _M256 register type: float
                 // load/store entry points (only declared here), fixed element permutations
                 // and bitwise operations.
  39             typedef _M256 VectorType;
                 // load/store are overloaded on an alignment/streaming tag passed by value.
                 // The store overloads with the extra `m` argument presumably write only the
                 // elements selected by m -- their definitions are not visible here; verify.
  40             template<typename A> static VectorType load(const float *x, A) PURE;
  41             static void store(float *mem, const VectorType x, AlignedFlag);
  42             static void store(float *mem, const VectorType x, UnalignedFlag);
  43             static void store(float *mem, const VectorType x, StreamingAndAlignedFlag);
  44             static void store(float *mem, const VectorType x, StreamingAndUnalignedFlag);
  45             static void store(float *mem, const VectorType x, const VectorType m, AlignedFlag);
  46             static void store(float *mem, const VectorType x, const VectorType m, UnalignedFlag);
  47             static void store(float *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
  48             static void store(float *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);
  49
                 // Permutations of 4-element groups. The function name lists the source
                 // elements that end up in result positions 3, 2, 1, 0 (a = element 0), which
                 // is the same order as the _MM_SHUFFLE arguments. _mm256_permute_ps applies
                 // the selection independently within each 128-bit half of the register.
  50             static VectorType cdab(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 3, 0, 1)); }
  51             static VectorType badc(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 0, 3, 2)); }
  52             static VectorType aaaa(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(0, 0, 0, 0)); }
  53             static VectorType bbbb(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1)); }
  54             static VectorType cccc(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2)); }
  55             static VectorType dddd(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3)); }
  56             static VectorType dacb(VectorType x) { return _mm256_permute_ps(x, _MM_SHUFFLE(3, 0, 2, 1)); }
  57
                 // Constants and bitwise operations.
                 // NOTE(review): _mm256_setallone_ps is not a standard intrinsic; presumably a
                 // helper from intrinsics.h that returns a register with all bits set -- confirm.
  58             OP0(allone, _mm256_setallone_ps())
  59             OP0(zero, _mm256_setzero_ps())
  60             OP2(or_, _mm256_or_ps(a, b))
  61             OP2(xor_, _mm256_xor_ps(a, b))
  62             OP2(and_, _mm256_and_ps(a, b))
                 // andnot_(a, b) computes (~a) & b, i.e. the FIRST operand is the one inverted.
  63             OP2(andnot_, _mm256_andnot_ps(a, b))
                 // blend(a, b, c): per element, takes b where the sign bit of c is set, else a.
  64             OP3(blend, _mm256_blendv_ps(a, b, c))
  65         };
  66
  67         template<> struct VectorHelper<_M256D>
  68         {
  69             typedef _M256D VectorType;
  70             template<typename A> static VectorType load(const double *x, A) PURE;
  71             static void store(double *mem, const VectorType x, AlignedFlag);
  72             static void store(double *mem, const VectorType x, UnalignedFlag);
  73             static void store(double *mem, const VectorType x, StreamingAndAlignedFlag);
  74             static void store(double *mem, const VectorType x, StreamingAndUnalignedFlag);
  75             static void store(double *mem, const VectorType x, const VectorType m, AlignedFlag);
  76             static void store(double *mem, const VectorType x, const VectorType m, UnalignedFlag);
  77             static void store(double *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
  78             static void store(double *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);
  79
  80             static VectorType cdab(VectorType x) { return _mm256_permute_pd(x, 5); }
  81             static VectorType badc(VectorType x) { return _mm256_permute2f128_pd(x, x, 1); }
  82             // aaaa bbbb cccc dddd specialized in vector.tcc
  83             static VectorType dacb(VectorType x) {
  84                 const __m128d cb = avx_cast<__m128d>(_mm_alignr_epi8(avx_cast<__m128i>(lo128(x)),
  85                             avx_cast<__m128i>(hi128(x)), sizeof(double))); // XXX: lo and hi swapped?
  86                 const __m128d da = _mm_blend_pd(lo128(x), hi128(x), 0 + 2); // XXX: lo and hi swapped?
  87                 return concat(cb, da);
  88             }
  89
  90             OP0(allone, _mm256_setallone_pd())
  91             OP0(zero, _mm256_setzero_pd())
  92             OP2(or_, _mm256_or_pd(a, b))
  93             OP2(xor_, _mm256_xor_pd(a, b))
  94             OP2(and_, _mm256_and_pd(a, b))
  95             OP2(andnot_, _mm256_andnot_pd(a, b))
  96             OP3(blend, _mm256_blendv_pd(a, b, c))
  97         };
  98
  99         template<> struct VectorHelper<_M256I>
 100         {
                 // Low-level helpers that work directly on the _M256I integer register type:
                 // load/store entry points templated on the memory element type (only declared
                 // here), fixed permutations of 32-bit elements and bitwise operations.
 101             typedef _M256I VectorType;
                 // One load/store overload per alignment/streaming tag type. The store overloads
                 // with the extra `m` argument presumably write only the elements selected by m
                 // -- their definitions are not visible here; verify.
 102             template<typename T> static VectorType load(const T *x, AlignedFlag) PURE;
 103             template<typename T> static VectorType load(const T *x, UnalignedFlag) PURE;
 104             template<typename T> static VectorType load(const T *x, StreamingAndAlignedFlag) PURE;
 105             template<typename T> static VectorType load(const T *x, StreamingAndUnalignedFlag) PURE;
 106             template<typename T> static void store(T *mem, const VectorType x, AlignedFlag);
 107             template<typename T> static void store(T *mem, const VectorType x, UnalignedFlag);
 108             template<typename T> static void store(T *mem, const VectorType x, StreamingAndAlignedFlag);
 109             template<typename T> static void store(T *mem, const VectorType x, StreamingAndUnalignedFlag);
 110             template<typename T> static void store(T *mem, const VectorType x, const VectorType m, AlignedFlag);
 111             template<typename T> static void store(T *mem, const VectorType x, const VectorType m, UnalignedFlag);
 112             template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
 113             template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);
 114
                 // Permutations of 32-bit elements, done by reinterpreting the register as
                 // floats and using _mm256_permute_ps (which permutes within each 128-bit half).
                 // The name lists the source elements placed in result positions 3, 2, 1, 0
                 // (a = element 0), matching the _MM_SHUFFLE argument order.
 115             static VectorType cdab(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(2, 3, 0, 1))); }
 116             static VectorType badc(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(1, 0, 3, 2))); }
 117             static VectorType aaaa(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(0, 0, 0, 0))); }
 118             static VectorType bbbb(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(1, 1, 1, 1))); }
 119             static VectorType cccc(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(2, 2, 2, 2))); }
 120             static VectorType dddd(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(3, 3, 3, 3))); }
 121             static VectorType dacb(VectorType x) { return avx_cast<VectorType>(_mm256_permute_ps(avx_cast<__m256>(x), _MM_SHUFFLE(3, 0, 2, 1))); }
 122
                 // Constants and bitwise operations.
                 // NOTE(review): 256-bit integer logic/blend intrinsics are AVX2 instructions;
                 // for plain AVX they are presumably emulated in intrinsics.h, as is the
                 // non-standard _mm256_setallone_si256 -- confirm.
 123             OP0(allone, _mm256_setallone_si256())
 124             OP0(zero, _mm256_setzero_si256())
 125             OP2(or_, _mm256_or_si256(a, b))
 126             OP2(xor_, _mm256_xor_si256(a, b))
 127             OP2(and_, _mm256_and_si256(a, b))
                 // andnot_(a, b) computes (~a) & b, i.e. the FIRST operand is the one inverted.
 128             OP2(andnot_, _mm256_andnot_si256(a, b))
                 // blend(a, b, c): byte-wise select -- takes the byte of b where the top bit of
                 // the corresponding byte of c is set, else the byte of a.
 129             OP3(blend, _mm256_blendv_epi8(a, b, c))
 130         };
 131
 132         template<> struct VectorHelper<__m128i>
 133         {
                 // Low-level helpers that work directly on the 128-bit __m128i register type:
                 // load/store entry points templated on the memory element type (only declared
                 // here), fixed permutations of 16-bit elements and bitwise operations.
 134             typedef __m128i VectorType;
                 // One load/store overload per alignment/streaming tag type. The store overloads
                 // with the extra `m` argument presumably write only the elements selected by m
                 // -- their definitions are not visible here; verify.
 135             template<typename T> static VectorType load(const T *x, AlignedFlag) PURE;
 136             template<typename T> static VectorType load(const T *x, UnalignedFlag) PURE;
 137             template<typename T> static VectorType load(const T *x, StreamingAndAlignedFlag) PURE;
 138             template<typename T> static VectorType load(const T *x, StreamingAndUnalignedFlag) PURE;
 139             template<typename T> static void store(T *mem, const VectorType x, AlignedFlag);
 140             template<typename T> static void store(T *mem, const VectorType x, UnalignedFlag);
 141             template<typename T> static void store(T *mem, const VectorType x, StreamingAndAlignedFlag);
 142             template<typename T> static void store(T *mem, const VectorType x, StreamingAndUnalignedFlag);
 143             template<typename T> static void store(T *mem, const VectorType x, const VectorType m, AlignedFlag);
 144             template<typename T> static void store(T *mem, const VectorType x, const VectorType m, UnalignedFlag);
 145             template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
 146             template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);
 147
                 // Permutations of 16-bit elements in groups of four: the same _MM_SHUFFLE
                 // pattern is applied first to the low four words (_mm_shufflelo_epi16) and then
                 // to the high four words (_mm_shufflehi_epi16). The name lists the source
                 // elements placed in positions 3, 2, 1, 0 of each group (a = element 0).
 148             static VectorType cdab(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1)); }
 149             static VectorType badc(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2)), _MM_SHUFFLE(1, 0, 3, 2)); }
 150             static VectorType aaaa(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0)); }
 151             static VectorType bbbb(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 1, 1, 1)), _MM_SHUFFLE(1, 1, 1, 1)); }
 152             static VectorType cccc(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(2, 2, 2, 2)); }
 153             static VectorType dddd(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); }
 154             static VectorType dacb(VectorType x) { return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, _MM_SHUFFLE(3, 0, 2, 1)), _MM_SHUFFLE(3, 0, 2, 1)); }
 155
                 // Constants and bitwise operations.
                 // NOTE(review): _mm_setallone_si128 is not a standard intrinsic; presumably a
                 // helper from intrinsics.h that returns a register with all bits set -- confirm.
 156             OP0(allone, _mm_setallone_si128())
 157             OP0(zero, _mm_setzero_si128())
 158             OP2(or_, _mm_or_si128(a, b))
 159             OP2(xor_, _mm_xor_si128(a, b))
 160             OP2(and_, _mm_and_si128(a, b))
                 // andnot_(a, b) computes (~a) & b, i.e. the FIRST operand is the one inverted.
 161             OP2(andnot_, _mm_andnot_si128(a, b))
                 // blend(a, b, c): byte-wise select -- takes the byte of b where the top bit of
                 // the corresponding byte of c is set, else the byte of a.
 162             OP3(blend, _mm_blendv_epi8(a, b, c))
 163         };
 164 #undef OP1
 165 #undef OP2
 166 #undef OP3
     // OP0 is defined together with OP1..OP3 for the register-type helpers; undefine it as
     // well so that the generic name does not leak out of this header into user code.
     #undef OP0
 167
 168 #define OP1(op) /* unary member op(a); forwards to the intrinsic named _mm256_<op>_ joined with SUFFIX */ \
 169         static inline VectorType INTRINSIC CONST op(const VectorType &a) { return CAT(_mm256_##op##_, SUFFIX)(a); }
     // All macros below generate static members of the enclosing VectorHelper and rely on
     // its VectorType typedef and on SUFFIX being #defined at the point of use. CAT is
     // defined outside this view; it is assumed to paste its two (macro-expanded) arguments.
     // OP(op): binary member op(a, b) forwarding to _mm256_<op>_<SUFFIX>.
 170 #define OP(op) \
 171         static inline VectorType INTRINSIC CONST op(const VectorType &a, const VectorType &b) { return CAT(_mm256_##op##_ , SUFFIX)(a, b); }
     // OP_(op): like OP, but no underscore is inserted before SUFFIX -- for names that
     // already end in '_' (e.g. or_ -> _mm256_or_<SUFFIX>).
 172 #define OP_(op) \
 173         static inline VectorType INTRINSIC CONST op(const VectorType &a, const VectorType &b) { return CAT(_mm256_##op    , SUFFIX)(a, b); }
     // OPx(op, op2): binary member named op that forwards to _mm256_<op2>_<SUFFIX>.
 174 #define OPx(op, op2) \
 175         static inline VectorType INTRINSIC CONST op(const VectorType &a, const VectorType &b) { return CAT(_mm256_##op2##_, SUFFIX)(a, b); }
     // OPcmp(op): binary member cmp<op>(a, b) forwarding to _mm256_cmp<op>_<SUFFIX>.
 176 #define OPcmp(op) \
 177         static inline VectorType INTRINSIC CONST cmp##op(const VectorType &a, const VectorType &b) { return CAT(_mm256_cmp##op##_, SUFFIX)(a, b); }
     // OP_CAST_(op): binary member op(a, b) that casts both operands to the float vector
     // type, applies _mm256_<op>ps and casts the result back to the SUFFIX type.
 178 #define OP_CAST_(op) \
 179         static inline VectorType INTRINSIC CONST op(const VectorType &a, const VectorType &b) { return CAT(_mm256_castps_, SUFFIX)( \
 180             _mm256_##op##ps(CAT(CAT(_mm256_cast, SUFFIX), _ps)(a), \
 181               CAT(CAT(_mm256_cast, SUFFIX), _ps)(b))); \
 182         }
     // MINMAX: element-wise binary min(a, b) and max(a, b) via _mm256_min_/_mm256_max_<SUFFIX>.
 183 #define MINMAX \
 184         static inline VectorType INTRINSIC CONST min(VectorType a, VectorType b) { return CAT(_mm256_min_, SUFFIX)(a, b); } \
 185         static inline VectorType INTRINSIC CONST max(VectorType a, VectorType b) { return CAT(_mm256_max_, SUFFIX)(a, b); }
 186
 187         template<> struct VectorHelper<double> {
                 // Arithmetic helpers for vectors of four doubles stored in an _M256D register.
                 // Members written with CAT(x, SUFFIX) resolve to the "pd" intrinsics (CAT is
                 // defined outside this view; assumed to paste x and the expanded SUFFIX).
 188             typedef _M256D VectorType;
 189             typedef double EntryType;
 190             typedef double ConcatType;
 191 #define SUFFIX pd
 192
                 // Keeps the elements of a whose mask bits are set and zeroes the rest
                 // (bitwise AND with the mask reinterpreted as doubles).
 193             static inline VectorType notMaskedToZero(VectorType a, _M256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_pd(mask), a); }
                 // Broadcasts a into all four elements.
 194             static inline VectorType set(const double a) { return CAT(_mm256_set1_, SUFFIX)(a); }
                 // Arguments are forwarded unchanged to _mm256_set_pd, whose first parameter is
                 // the HIGHEST element: a ends up in element 3 and d in element 0.
 195             static inline VectorType set(const double a, const double b, const double c, const double d) {
 196                 return CAT(_mm256_set_, SUFFIX)(a, b, c, d);
 197             }
 198             static inline VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
                 // NOTE(review): _mm256_setone_pd is a project helper (not a standard intrinsic);
                 // presumably all elements 1.0, as the trailing alternative `set(1.)` suggests.
 199             static inline VectorType one()  { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.); }
 200
                 // v1 = v1 * v2 + v3, computed as a separate multiply followed by an add.
 201             static inline void multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
                 // Masked multiply: returns a * b where _mask is set and a unchanged elsewhere.
                 // The product is computed for all elements and then selected bitwise.
 202             static inline VectorType mul(VectorType a, VectorType b, _M256 _mask) {
 203                 _M256D mask = _mm256_castps_pd(_mask);
 204                 return _mm256_or_pd(
 205                     _mm256_and_pd(mask, _mm256_mul_pd(a, b)),
 206                     _mm256_andnot_pd(mask, a)
 207                     );
 208             }
                 // Masked divide: returns a / b where _mask is set and a unchanged elsewhere.
                 // The quotient is computed for all elements (including masked-out ones).
 209             static inline VectorType div(VectorType a, VectorType b, _M256 _mask) {
 210                 _M256D mask = _mm256_castps_pd(_mask);
 211                 return _mm256_or_pd(
 212                     _mm256_and_pd(mask, _mm256_div_pd(a, b)),
 213                     _mm256_andnot_pd(mask, a)
 214                     );
 215             }
 216
                 // Element-wise add/sub/mul and comparisons generated by the OP/OPcmp macros.
                 // NOTE(review): _mm256_cmpeq_pd etc. are not standard AVX intrinsics (AVX has
                 // _mm256_cmp_pd with a predicate); presumably wrappers from intrinsics.h.
 217             OP(add) OP(sub) OP(mul)
 218             OPcmp(eq) OPcmp(neq)
 219             OPcmp(lt) OPcmp(nlt)
 220             OPcmp(le) OPcmp(nle)
 221
 222             OP1(sqrt)
                 // Reciprocal square root, computed as a full division one() / sqrt(x).
 223             static inline VectorType rsqrt(VectorType x) {
 224                 return _mm256_div_pd(one(), sqrt(x));
 225             }
                 // Reciprocal, computed as a full division one() / x.
 226             static inline VectorType reciprocal(VectorType x) {
 227                 return _mm256_div_pd(one(), x);
 228             }
                 // An element compares "unordered" with itself only if it is NaN.
 229             static inline VectorType isNaN(VectorType x) {
 230                 return _mm256_cmpunord_pd(x, x);
 231             }
                 // 0 * x is NaN when x is infinite or NaN, so the "ordered" comparison is
                 // true exactly for finite elements.
 232             static inline VectorType isFinite(VectorType x) {
 233                 return _mm256_cmpord_pd(x, _mm256_mul_pd(zero(), x));
 234             }
                 // NOTE(review): _mm256_setabsmask_pd is a project helper; presumably a mask with
                 // every bit except the sign bit set, so the AND clears the sign -- confirm.
 235             static inline VectorType abs(const VectorType a) {
 236                 return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_pd());
 237             }
 238
                 // Element-wise binary min/max (MINMAX), followed by horizontal reductions that
                 // take a single vector and return one scalar.
 239             MINMAX
                 // Horizontal minimum: combine lower and upper 128-bit halves, then the two
                 // remaining doubles.
 240             static inline EntryType min(VectorType a) {
 241                 __m128d b = _mm_min_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
 242                 b = _mm_min_sd(b, _mm_unpackhi_pd(b, b));
 243                 return _mm_cvtsd_f64(b);
 244             }
                 // Horizontal maximum, same reduction scheme as min.
 245             static inline EntryType max(VectorType a) {
 246                 __m128d b = _mm_max_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
 247                 b = _mm_max_sd(b, _mm_unpackhi_pd(b, b));
 248                 return _mm_cvtsd_f64(b);
 249             }
                 // Horizontal product of all four elements.
 250             static inline EntryType mul(VectorType a) {
 251                 __m128d b = _mm_mul_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
                     // the shuffle swaps the two doubles so that element 0 is multiplied by element 1
 252                 b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
 253                 return _mm_cvtsd_f64(b);
 254             }
                 // Horizontal sum of all four elements.
 255             static inline EntryType add(VectorType a) {
 256                 __m128d b = _mm_add_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
 257                 b = _mm_hadd_pd(b, b); // or: b = _mm_add_sd(b, _mm256_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
 258                 return _mm_cvtsd_f64(b);
 259             }
 260 #undef SUFFIX
                 // Rounds every element to the nearest integer value (_MM_FROUND_NINT).
 261             static inline VectorType round(VectorType a) {
 262                 return _mm256_round_pd(a, _MM_FROUND_NINT);
 263             }
 264         };
 265
 266         template<> struct VectorHelper<float> {
                 // Arithmetic helpers for vectors of eight floats stored in an _M256 register.
                 // Members written with CAT(x, SUFFIX) resolve to the "ps" intrinsics (CAT is
                 // defined outside this view; assumed to paste x and the expanded SUFFIX).
 267             typedef float EntryType;
 268             typedef _M256 VectorType;
 269             typedef double ConcatType;
 270 #define SUFFIX ps
 271
                 // Keeps the elements of a whose mask bits are set and zeroes the rest.
 272             static inline VectorType notMaskedToZero(VectorType a, _M256 mask) { return CAT(_mm256_and_, SUFFIX)(mask, a); }
                 // Broadcasts a into all eight elements.
 273             static inline VectorType set(const float a) { return CAT(_mm256_set1_, SUFFIX)(a); }
                 // Arguments are forwarded unchanged to _mm256_set_ps, whose first parameter is
                 // the HIGHEST element: a ends up in element 7 and h in element 0.
 274             static inline VectorType set(const float a, const float b, const float c, const float d,
 275                     const float e, const float f, const float g, const float h) {
 276                 return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }
 277             static inline VectorType zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
                 // NOTE(review): _mm256_setone_ps is a project helper (not a standard intrinsic);
                 // presumably all elements 1.0f, as the trailing alternative `set(1.f)` suggests.
 278             static inline VectorType one()  { return CAT(_mm256_setone_, SUFFIX)(); }// set(1.f); }
                 // Converts two double vectors to float and joins them: the four converted
                 // elements of a form the lower half, those of b the upper half.
 279             static inline _M256 concat(_M256D a, _M256D b) { return _mm256_insertf128_ps(avx_cast<_M256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); }
 280
                 // v1 = v1 * v2 + v3, computed as a separate multiply followed by an add.
 281             static inline void multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
                 // Masked multiply: returns a * b where mask is set and a unchanged elsewhere.
 282             static inline VectorType mul(VectorType a, VectorType b, _M256 mask) {
 283                 return _mm256_or_ps(
 284                     _mm256_and_ps(mask, _mm256_mul_ps(a, b)),
 285                     _mm256_andnot_ps(mask, a)
 286                     );
 287             }
                 // Masked divide: returns a / b where mask is set and a unchanged elsewhere.
                 // The quotient is computed for all elements (including masked-out ones).
 288             static inline VectorType div(VectorType a, VectorType b, _M256 mask) {
 289                 return _mm256_or_ps(
 290                     _mm256_and_ps(mask, _mm256_div_ps(a, b)),
 291                     _mm256_andnot_ps(mask, a)
 292                     );
 293             }
 294
                 // Element-wise add/sub/mul and comparisons generated by the OP/OPcmp macros.
                 // NOTE(review): _mm256_cmpeq_ps etc. are not standard AVX intrinsics (AVX has
                 // _mm256_cmp_ps with a predicate); presumably wrappers from intrinsics.h.
 295             OP(add) OP(sub) OP(mul)
 296             OPcmp(eq) OPcmp(neq)
 297             OPcmp(lt) OPcmp(nlt)
 298             OPcmp(le) OPcmp(nle)
 299
                 // Unlike VectorHelper<double>::rsqrt, rsqrt here maps to the _mm256_rsqrt_ps
                 // instruction, which only yields an approximation (reduced precision).
 300             OP1(sqrt) OP1(rsqrt)
                 // An element compares "unordered" with itself only if it is NaN.
 301             static inline VectorType isNaN(VectorType x) {
 302                 return _mm256_cmpunord_ps(x, x);
 303             }
                 // 0 * x is NaN when x is infinite or NaN, so the "ordered" comparison is
                 // true exactly for finite elements.
 304             static inline VectorType isFinite(VectorType x) {
 305                 return _mm256_cmpord_ps(x, _mm256_mul_ps(zero(), x));
 306             }
                 // Approximate reciprocal (_mm256_rcp_ps), not a full-precision division.
 307             static inline VectorType reciprocal(VectorType x) {
 308                 return _mm256_rcp_ps(x);
 309             }
                 // NOTE(review): _mm256_setabsmask_ps is a project helper; presumably a mask with
                 // every bit except the sign bit set, so the AND clears the sign -- confirm.
 310             static inline VectorType abs(const VectorType a) {
 311                 return CAT(_mm256_and_, SUFFIX)(a, _mm256_setabsmask_ps());
 312             }
 313
                 // Element-wise binary min/max (MINMAX), followed by horizontal reductions that
                 // take a single vector and return one scalar. Each reduction first combines the
                 // lower and upper 128-bit halves, then folds the remaining four floats.
 314             MINMAX
 315             static inline EntryType min(VectorType a) {
 316                 __m128 b = _mm_min_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
 317                 b = _mm_min_ps(b, _mm_movehl_ps(b, b));   // b = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)
 318                 b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = min(a0, a1), a1, a2, a3
 319                 return _mm_cvtss_f32(b);
 320             }
 321             static inline EntryType max(VectorType a) {
 322                 __m128 b = _mm_max_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
 323                 b = _mm_max_ps(b, _mm_movehl_ps(b, b));   // b = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)
 324                 b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = max(a0, a1), a1, a2, a3
 325                 return _mm_cvtss_f32(b);
 326             }
                 // Horizontal product of all eight elements.
 327             static inline EntryType mul(VectorType a) {
 328                 __m128 b = _mm_mul_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
                     // multiply by the element-reversed copy: element 0 = b0*b3, element 1 = b1*b2
 329                 b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
                     // the shuffle moves element 1 into position 0 for the final scalar multiply
 330                 b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
 331                 return _mm_cvtss_f32(b);
 332             }
                 // Horizontal sum of all eight elements (same folding scheme as mul).
 333             static inline EntryType add(VectorType a) {
 334                 __m128 b = _mm_add_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
 335                 b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
 336                 b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
 337                 return _mm_cvtss_f32(b);
 338             }
 339 #undef SUFFIX
                 // Rounds every element to the nearest integer value (_MM_FROUND_NINT).
 340             static inline VectorType round(VectorType a) {
 341                 return _mm256_round_ps(a, _MM_FROUND_NINT);
 342             }
 343         };
 344
 345         template<> struct VectorHelper<sfloat> : public VectorHelper<float> {}; // sfloat adds nothing: it inherits every helper from VectorHelper<float>
 346
 347         template<> struct VectorHelper<int> {
 348             typedef int EntryType;
 349             typedef _M256I VectorType;
 350             typedef long long ConcatType;
 351 #define SUFFIX si256
 352
 353             OP_(or_) OP_(and_) OP_(xor_)
 354             static inline VectorType INTRINSIC CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
 355             static inline VectorType INTRINSIC CONST notMaskedToZero(VectorType a, _M256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); }
 356 #undef SUFFIX
 357 #define SUFFIX epi32
 358             static inline VectorType INTRINSIC CONST one() { return CAT(_mm256_setone_, SUFFIX)(); }
 359
 360             static inline VectorType INTRINSIC CONST set(const int a) { return CAT(_mm256_set1_, SUFFIX)(a); }
 361             static inline VectorType INTRINSIC CONST set(const int a, const int b, const int c, const int d,
 362                     const int e, const int f, const int g, const int h) {
 363                 return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }
 364
 365             static inline void INTRINSIC CONST multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
 366
 367             static inline VectorType shiftLeft(VectorType a, int shift) {
 368                 return CAT(_mm256_slli_, SUFFIX)(a, shift);
 369             }
 370             static inline VectorType shiftRight(VectorType a, int shift) {
 371                 return CAT(_mm256_srai_, SUFFIX)(a, shift);
 372             }
 373             OP1(abs)
 374
 375             MINMAX
 376             static inline EntryType INTRINSIC CONST min(VectorType a) {
 377                 __m128i b = _mm_min_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
 378                 b = _mm_min_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
 379                 b = _mm_min_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
 380                 return _mm_cvtsi128_si32(b);
 381             }
 382             static inline EntryType INTRINSIC CONST max(VectorType a) {
 383                 __m128i b = _mm_max_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
 384                 b = _mm_max_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
 385                 b = _mm_max_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
 386                 return _mm_cvtsi128_si32(b);
 387             }
 388             static inline EntryType INTRINSIC CONST add(VectorType a) {
 389                 __m128i b = _mm_add_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
 390                 b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
 391                 b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
 392                 return _mm_cvtsi128_si32(b);
 393             }
 394             static inline EntryType INTRINSIC CONST mul(VectorType a) {
 395                 __m128i b = _mm_mullo_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
 396                 b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
 397                 b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
 398                 return _mm_cvtsi128_si32(b);
 399             }
 400
 401             static inline VectorType INTRINSIC CONST mul(VectorType a, VectorType b) { return _mm256_mullo_epi32(a, b); }
 402
 403             OP(add) OP(sub)
 404             OPcmp(eq)
 405             OPcmp(lt)
 406             OPcmp(gt)
 407             static inline VectorType INTRINSIC CONST cmpneq(const VectorType &a, const VectorType &b) { _M256I x = cmpeq(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
 408             static inline VectorType INTRINSIC CONST cmpnlt(const VectorType &a, const VectorType &b) { _M256I x = cmplt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
 409             static inline VectorType INTRINSIC CONST cmple (const VectorType &a, const VectorType &b) { _M256I x = cmpgt(a, b); return _mm256_andnot_si256(x, _mm256_setallone_si256()); }
 410             static inline VectorType INTRINSIC CONST cmpnle(const VectorType &a, const VectorType &b) { return cmpgt(a, b); }
 411 #undef SUFFIX
 412             static inline VectorType INTRINSIC CONST round(VectorType a) { return a; }
 413         };
 414
 415         template<> struct VectorHelper<unsigned int> {
                 // Arithmetic helpers for vectors of eight unsigned 32-bit integers stored in an
                 // _M256I register. SUFFIX switches between si256 (bitwise members), epu32
                 // (unsigned min/max, one) and epi32 (members whose signed form is reused).
                 // CAT is defined outside this view; assumed to paste its argument and SUFFIX.
 416             typedef unsigned int EntryType;
 417             typedef _M256I VectorType;
 418             typedef unsigned long long ConcatType;
 419 #define SUFFIX si256
                 // Bitwise ops are carried out by the float-domain instructions (OP_CAST_ casts
                 // to the float vector type and back).
 420             OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_)
 421             static inline VectorType INTRINSIC CONST zero() { return CAT(_mm256_setzero_, SUFFIX)(); }
                 // Keeps the elements of a whose mask bits are set and zeroes the rest.
 422             static inline VectorType INTRINSIC CONST notMaskedToZero(VectorType a, _M256 mask) { return CAT(_mm256_and_, SUFFIX)(_mm256_castps_si256(mask), a); }
 423
 424 #undef SUFFIX
 425 #define SUFFIX epu32
                 // NOTE(review): _mm256_setone_epu32 is a project helper; presumably all elements 1.
 426             static inline VectorType INTRINSIC CONST one() { return CAT(_mm256_setone_, SUFFIX)(); }
 427
                 // Element-wise binary min/max (MINMAX, unsigned), followed by horizontal
                 // reductions that take a single vector and return one scalar. Each reduction
                 // combines the lower and upper 128-bit halves, then the two 64-bit halves, then
                 // the two remaining 32-bit elements. The int returned by _mm_cvtsi128_si32 is
                 // converted to EntryType (unsigned int) on return.
 428             MINMAX
 429             static inline EntryType INTRINSIC CONST min(VectorType a) {
 430                 __m128i b = _mm_min_epu32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
 431                 b = _mm_min_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
 432                 b = _mm_min_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
 433                 return _mm_cvtsi128_si32(b);
 434             }
 435             static inline EntryType INTRINSIC CONST max(VectorType a) {
 436                 __m128i b = _mm_max_epu32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
 437                 b = _mm_max_epu32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
 438                 b = _mm_max_epu32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2))); // using lo_epi16 for speed here
 439                 return _mm_cvtsi128_si32(b);
 440             }
                 // Sum and product use the epi32 instructions: the low 32 bits of a sum or
                 // product are the same for signed and unsigned operands.
 441             static inline EntryType INTRINSIC CONST add(VectorType a) {
 442                 __m128i b = _mm_add_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
 443                 b = _mm_add_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
 444                 b = _mm_add_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
 445                 return _mm_cvtsi128_si32(b);
 446             }
 447             static inline EntryType INTRINSIC CONST mul(VectorType a) {
 448                 __m128i b = _mm_mullo_epi32(avx_cast<__m128i>(a), _mm256_extractf128_si256(a, 1));
 449                 b = _mm_mullo_epi32(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2)));
 450                 b = _mm_mullo_epi32(b, _mm_shufflelo_epi16(b, _MM_SHUFFLE(1, 0, 3, 2)));
 451                 return _mm_cvtsi128_si32(b);
 452             }
 453
                 // Element-wise product, keeping the low 32 bits of each result.
 454             static inline VectorType INTRINSIC CONST mul(VectorType a, VectorType b) { return _mm256_mullo_epi32(a, b); }
 455
 456 #undef SUFFIX
 457 #define SUFFIX epi32
                 // Shifts every element left by `shift` bits, shifting in zeros.
 458             static inline VectorType shiftLeft(VectorType a, int shift) {
 459                 return CAT(_mm256_slli_, SUFFIX)(a, shift);
 460             }
                 // Logical right shift (srli): zeros are shifted in, as required for unsigned.
 461             static inline VectorType shiftRight(VectorType a, int shift) {
 462                 return CAT(_mm256_srli_, SUFFIX)(a, shift);
 463             }
                 // Broadcasts a into all eight elements.
 464             static inline VectorType INTRINSIC CONST set(const unsigned int a) { return CAT(_mm256_set1_, SUFFIX)(a); }
                 // Arguments are forwarded unchanged to _mm256_set_epi32, whose first parameter
                 // is the HIGHEST element: a ends up in element 7 and h in element 0.
 465             static inline VectorType INTRINSIC CONST set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d,
 466                     const unsigned int e, const unsigned int f, const unsigned int g, const unsigned int h) {
 467                 return CAT(_mm256_set_, SUFFIX)(a, b, c, d, e, f, g, h); }
 468
 469             OP(add) OP(sub)
 470             OPcmp(eq)
                 // andnot(x, allone) is the bitwise NOT of x, so cmpneq = !cmpeq.
 471             static inline VectorType INTRINSIC CONST cmpneq(const VectorType &a, const VectorType &b) { return _mm256_andnot_si256(cmpeq(a, b), _mm256_setallone_si256()); }
 472
                 // Default: dedicated unsigned comparisons (_mm256_cmplt_epu32/_mm256_cmpgt_epu32
                 // are project helpers defined outside this view). With
                 // USE_INCORRECT_UNSIGNED_COMPARE defined, OPcmp expands with SUFFIX epi32, i.e.
                 // a SIGNED comparison that gives wrong answers for values >= 2^31.
 473 #ifndef USE_INCORRECT_UNSIGNED_COMPARE
 474             static inline VectorType INTRINSIC CONST cmplt(const VectorType &a, const VectorType &b) {
 475                 return _mm256_cmplt_epu32(a, b);
 476             }
 477             static inline VectorType INTRINSIC CONST cmpgt(const VectorType &a, const VectorType &b) {
 478                 return _mm256_cmpgt_epu32(a, b);
 479             }
 480 #else
 481             OPcmp(lt)
 482             OPcmp(gt)
 483 #endif
                 // Derived comparisons: nlt = !lt, le = !gt, nle = gt.
 484             static inline VectorType INTRINSIC CONST cmpnlt(const VectorType &a, const VectorType &b) { return _mm256_andnot_si256(cmplt(a, b), _mm256_setallone_si256()); }
 485             static inline VectorType INTRINSIC CONST cmple (const VectorType &a, const VectorType &b) { return _mm256_andnot_si256(cmpgt(a, b), _mm256_setallone_si256()); }
 486             static inline VectorType INTRINSIC CONST cmpnle(const VectorType &a, const VectorType &b) { return cmpgt(a, b); }
 487
 488 #undef SUFFIX
                 // Integers are already integral: round is the identity.
 489             static inline VectorType INTRINSIC CONST round(VectorType a) { return a; }
 490         };
 491
 492         template<> struct VectorHelper<signed short> {
 493             typedef VectorTypeHelper<signed short>::Type VectorType;
 494             typedef signed short EntryType;
 495             typedef int ConcatType;
 496
 497             static inline VectorType INTRINSIC CONST or_(VectorType a, VectorType b) { return _mm_or_si128(a, b); }
 498             static inline VectorType INTRINSIC CONST and_(VectorType a, VectorType b) { return _mm_and_si128(a, b); }
 499             static inline VectorType INTRINSIC CONST xor_(VectorType a, VectorType b) { return _mm_xor_si128(a, b); }
 500             static inline VectorType INTRINSIC CONST zero() { return _mm_setzero_si128(); }
 501             static inline VectorType INTRINSIC CONST notMaskedToZero(VectorType a, __m128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); }
 502
 503 #define SUFFIX epi16
 504             static inline VectorType INTRINSIC CONST one() { return CAT(_mm_setone_, SUFFIX)(); }
 505
 506             static inline VectorType shiftLeft(VectorType a, int shift) {
 507                 return CAT(_mm_slli_, SUFFIX)(a, shift);
 508             }
 509             static inline VectorType shiftRight(VectorType a, int shift) {
 510                 return CAT(_mm_srai_, SUFFIX)(a, shift);
 511             }
 512             static inline VectorType INTRINSIC CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }
 513             static inline VectorType INTRINSIC CONST set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
 514                     const EntryType e, const EntryType f, const EntryType g, const EntryType h) {
 515                 return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
 516             }
 517
 518             static inline void INTRINSIC CONST multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) {
 519                 v1 = add(mul(v1, v2), v3); }
 520
 521             static inline VectorType INTRINSIC CONST abs(VectorType a) { return _mm_abs_epi16(a); }
 522             static inline VectorType INTRINSIC CONST mul(VectorType a, VectorType b) { return _mm_mullo_epi16(a, b); }
 523             static inline VectorType INTRINSIC CONST min(VectorType a, VectorType b) { return _mm_min_epi16(a, b); }
 524             static inline VectorType INTRINSIC CONST max(VectorType a, VectorType b) { return _mm_max_epi16(a, b); }
 525
 526             static inline EntryType INTRINSIC CONST min(VectorType a) {
 527                 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
 528                 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 529                 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 530                 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 531                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 532             }
 533             static inline EntryType INTRINSIC CONST max(VectorType a) {
 534                 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
 535                 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 536                 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 537                 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 538                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 539             }
 540             static inline EntryType INTRINSIC CONST mul(VectorType a) {
 541                 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 542                 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 543                 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 544                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 545             }
 546             static inline EntryType INTRINSIC CONST add(VectorType a) {
 547                 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 548                 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 549                 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 550                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 551             }
 552
 553             static inline VectorType INTRINSIC CONST add(VectorType a, VectorType b) { return _mm_add_epi16(a, b); }
 554             static inline VectorType INTRINSIC CONST sub(VectorType a, VectorType b) { return _mm_sub_epi16(a, b); }
 555             static inline VectorType INTRINSIC CONST cmpeq(VectorType a, VectorType b) { return _mm_cmpeq_epi16(a, b); }
 556             static inline VectorType INTRINSIC CONST cmplt(VectorType a, VectorType b) { return _mm_cmplt_epi16(a, b); }
 557             static inline VectorType INTRINSIC CONST cmpgt(VectorType a, VectorType b) { return _mm_cmpgt_epi16(a, b); }
 558             static inline VectorType cmpneq(const VectorType &a, const VectorType &b) { __m128i x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
 559             static inline VectorType cmpnlt(const VectorType &a, const VectorType &b) { __m128i x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
 560             static inline VectorType cmple (const VectorType &a, const VectorType &b) { __m128i x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
 561             static inline VectorType cmpnle(const VectorType &a, const VectorType &b) { return cmpgt(a, b); }
 562 #undef SUFFIX
 563             static inline VectorType round(VectorType a) { return a; }
 564         };
 565
 566         template<> struct VectorHelper<unsigned short> {
 567             typedef VectorTypeHelper<unsigned short>::Type VectorType;
 568             typedef unsigned short EntryType;
 569             typedef unsigned int ConcatType;
 570
 571             static inline VectorType INTRINSIC CONST or_(VectorType a, VectorType b) { return _mm_or_si128(a, b); }
 572             static inline VectorType INTRINSIC CONST and_(VectorType a, VectorType b) { return _mm_and_si128(a, b); }
 573             static inline VectorType INTRINSIC CONST xor_(VectorType a, VectorType b) { return _mm_xor_si128(a, b); }
 574             static inline VectorType INTRINSIC CONST zero() { return _mm_setzero_si128(); }
 575             static inline VectorType INTRINSIC CONST notMaskedToZero(VectorType a, __m128 mask) { return _mm_and_si128(_mm_castps_si128(mask), a); }
 576             static inline VectorType INTRINSIC CONST one() { return _mm_setone_epu16(); }
 577
 578             static inline VectorType INTRINSIC CONST mul(VectorType a, VectorType b) { return _mm_mullo_epi16(a, b); }
 579             static inline VectorType INTRINSIC CONST min(VectorType a, VectorType b) { return _mm_min_epu16(a, b); }
 580             static inline VectorType INTRINSIC CONST max(VectorType a, VectorType b) { return _mm_max_epu16(a, b); }
 581
 582 #define SUFFIX epi16
 583             static inline VectorType shiftLeft(VectorType a, int shift) {
 584                 return CAT(_mm_slli_, SUFFIX)(a, shift);
 585             }
 586             static inline VectorType shiftRight(VectorType a, int shift) {
 587                 return CAT(_mm_srli_, SUFFIX)(a, shift);
 588             }
 589             static inline EntryType INTRINSIC CONST min(VectorType a) {
 590                 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
 591                 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 592                 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 593                 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 594                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 595             }
 596             static inline EntryType INTRINSIC CONST max(VectorType a) {
 597                 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
 598                 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 599                 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 600                 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 601                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 602             }
 603             static inline EntryType INTRINSIC CONST mul(VectorType a) {
 604                 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
 605                 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 606                 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 607                 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 608                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 609             }
 610             static inline EntryType INTRINSIC CONST add(VectorType a) {
 611                 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
 612                 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 613                 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 614                 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 615                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 616             }
 617             static inline VectorType INTRINSIC CONST set(const EntryType a) { return CAT(_mm_set1_, SUFFIX)(a); }
 618             static inline VectorType INTRINSIC CONST set(const EntryType a, const EntryType b, const EntryType c,
 619                     const EntryType d, const EntryType e, const EntryType f,
 620                     const EntryType g, const EntryType h) {
 621                 return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
 622             }
 623
 624             static inline VectorType INTRINSIC CONST add(VectorType a, VectorType b) { return _mm_add_epi16(a, b); }
 625             static inline VectorType INTRINSIC CONST sub(VectorType a, VectorType b) { return _mm_sub_epi16(a, b); }
 626             static inline VectorType INTRINSIC CONST cmpeq(VectorType a, VectorType b) { return _mm_cmpeq_epi16(a, b); }
 627             static inline VectorType cmpneq(const VectorType &a, const VectorType &b) { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); }
 628
 629 #ifndef USE_INCORRECT_UNSIGNED_COMPARE
 630             static inline VectorType INTRINSIC CONST cmplt(VectorType a, VectorType b) { return _mm_cmplt_epu16(a, b); }
 631             static inline VectorType INTRINSIC CONST cmpgt(VectorType a, VectorType b) { return _mm_cmpgt_epu16(a, b); }
 632 #else
 633             static inline VectorType INTRINSIC CONST cmplt(VectorType a, VectorType b) { return _mm_cmplt_epi16(a, b); }
 634             static inline VectorType INTRINSIC CONST cmpgt(VectorType a, VectorType b) { return _mm_cmpgt_epi16(a, b); }
 635 #endif
 636             static inline VectorType cmpnlt(const VectorType &a, const VectorType &b) { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); }
 637             static inline VectorType cmple (const VectorType &a, const VectorType &b) { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); }
 638             static inline VectorType cmpnle(const VectorType &a, const VectorType &b) { return cmpgt(a, b); }
 639 #undef SUFFIX
 640             static inline VectorType round(VectorType a) { return a; }
 641         };
 642 #undef OP1
 643 #undef OP
 644 #undef OP_
 645 #undef OPx
 646 #undef OPcmp
 647
 648 template<> struct VectorHelper<char>
 649 {
 650     typedef VectorTypeHelper<char>::Type VectorType;
 651     typedef char EntryType;
 652     typedef short ConcatType;
 653 };
 654
 655 template<> struct VectorHelper<unsigned char>
 656 {
 657     typedef VectorTypeHelper<unsigned char>::Type VectorType;
 658     typedef unsigned char EntryType;
 659     typedef unsigned short ConcatType;
 660 };
 661
 662 } // namespace AVX
 663 } // namespace Vc
 664
 665 #include "vectorhelper.tcc"
 666
 667 #endif // AVX_VECTORHELPER_H