Vc/include/Vc/sse/vectorhelper.h

   1 /*  This file is part of the Vc library.
   2
   3     Copyright (C) 2009-2011 Matthias Kretz <kretz@kde.org>
   4
   5     Vc is free software: you can redistribute it and/or modify
   6     it under the terms of the GNU Lesser General Public License as
   7     published by the Free Software Foundation, either version 3 of
   8     the License, or (at your option) any later version.
   9
  10     Vc is distributed in the hope that it will be useful, but
  11     WITHOUT ANY WARRANTY; without even the implied warranty of
  12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13     GNU Lesser General Public License for more details.
  14
  15     You should have received a copy of the GNU Lesser General Public
  16     License along with Vc.  If not, see <http://www.gnu.org/licenses/>.
  17
  18 */
  19
  20 #ifndef SSE_VECTORHELPER_H
  21 #define SSE_VECTORHELPER_H
  22
  23 #include "types.h"
  24 #include <limits>
  25
  26 namespace Vc
  27 {
  28 namespace SSE
  29 {
  30     template<typename VectorType, unsigned int Size> struct SortHelper
  31     {
  32         static VectorType sort(VectorType) PURE;
  33     };
  34     template<unsigned int Size> struct SortHelper<M256, Size>
  35     {
  36         static M256 sort(const M256 &) PURE;
  37     };
  38
  39 #undef OP_DECL
  40 #undef PARENT_DATA
  41 #undef PARENT_DATA_CONST
  42
  43 #define OP0(name, code) static inline VectorType name() PURE { return code; }
  44 #define OP1(name, code) static inline VectorType name(const VectorType &a) PURE { return code; }
  45 #define OP2(name, code) static inline VectorType name(const VectorType &a, const VectorType &b) PURE { return code; }
  46 #define OP3(name, code) static inline VectorType name(const VectorType &a, const VectorType &b, const VectorType &c) PURE { return code; }
  47
  48         template<> struct VectorHelper<_M128>
  49         {
  50             typedef _M128 VectorType;
  51             template<typename A> static VectorType load(const float *x, A) PURE;
  52             static void store(float *mem, const VectorType x, AlignedFlag);
  53             static void store(float *mem, const VectorType x, UnalignedFlag);
  54             static void store(float *mem, const VectorType x, StreamingAndAlignedFlag);
  55             static void store(float *mem, const VectorType x, StreamingAndUnalignedFlag);
  56             static void store(float *mem, const VectorType x, const VectorType m, AlignedFlag);
  57             static void store(float *mem, const VectorType x, const VectorType m, UnalignedFlag);
  58             static void store(float *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
  59             static void store(float *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);
  60
  61             OP0(allone, _mm_setallone_ps())
  62             OP0(zero, _mm_setzero_ps())
  63             OP2(or_, _mm_or_ps(a, b))
  64             OP2(xor_, _mm_xor_ps(a, b))
  65             OP2(and_, _mm_and_ps(a, b))
  66             OP2(andnot_, _mm_andnot_ps(a, b))
  67             OP3(blend, _mm_blendv_ps(a, b, c))
  68         };
  69
  70
  71         template<> struct VectorHelper<M256>
  72         {
  73             typedef M256 VectorType;
  74             template<typename A> static VectorType load(const float *x, A) PURE;
  75             static void store(float *mem, const VectorType &x, AlignedFlag);
  76             static void store(float *mem, const VectorType &x, UnalignedFlag);
  77             static void store(float *mem, const VectorType &x, StreamingAndAlignedFlag);
  78             static void store(float *mem, const VectorType &x, StreamingAndUnalignedFlag);
  79             static void store(float *mem, const VectorType &x, const VectorType &m, AlignedFlag);
  80             static void store(float *mem, const VectorType &x, const VectorType &m, UnalignedFlag);
  81             static void store(float *mem, const VectorType &x, const VectorType &m, StreamingAndAlignedFlag);
  82             static void store(float *mem, const VectorType &x, const VectorType &m, StreamingAndUnalignedFlag);
  83
  84             OP0(allone, VectorType::create(_mm_setallone_ps(), _mm_setallone_ps()))
  85             OP0(zero, VectorType::create(_mm_setzero_ps(), _mm_setzero_ps()))
  86             OP2(or_, VectorType::create(_mm_or_ps(a[0], b[0]), _mm_or_ps(a[1], b[1])))
  87             OP2(xor_, VectorType::create(_mm_xor_ps(a[0], b[0]), _mm_xor_ps(a[1], b[1])))
  88             OP2(and_, VectorType::create(_mm_and_ps(a[0], b[0]), _mm_and_ps(a[1], b[1])))
  89             OP2(andnot_, VectorType::create(_mm_andnot_ps(a[0], b[0]), _mm_andnot_ps(a[1], b[1])))
  90             OP3(blend, VectorType::create(_mm_blendv_ps(a[0], b[0], c[0]), _mm_blendv_ps(a[1], b[1], c[1])))
  91         };
  92
  93         template<> struct VectorHelper<_M128D>
  94         {
  95             typedef _M128D VectorType;
  96             template<typename A> static VectorType load(const double *x, A) PURE;
  97             static void store(double *mem, const VectorType x, AlignedFlag);
  98             static void store(double *mem, const VectorType x, UnalignedFlag);
  99             static void store(double *mem, const VectorType x, StreamingAndAlignedFlag);
 100             static void store(double *mem, const VectorType x, StreamingAndUnalignedFlag);
 101             static void store(double *mem, const VectorType x, const VectorType m, AlignedFlag);
 102             static void store(double *mem, const VectorType x, const VectorType m, UnalignedFlag);
 103             static void store(double *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
 104             static void store(double *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);
 105
 106             OP0(allone, _mm_setallone_pd())
 107             OP0(zero, _mm_setzero_pd())
 108             OP2(or_, _mm_or_pd(a, b))
 109             OP2(xor_, _mm_xor_pd(a, b))
 110             OP2(and_, _mm_and_pd(a, b))
 111             OP2(andnot_, _mm_andnot_pd(a, b))
 112             OP3(blend, _mm_blendv_pd(a, b, c))
 113         };
 114
 115         template<> struct VectorHelper<_M128I>
 116         {
 117             typedef _M128I VectorType;
 118             template<typename T> static VectorType load(const T *x, AlignedFlag) PURE;
 119             template<typename T> static VectorType load(const T *x, UnalignedFlag) PURE;
 120             template<typename T> static VectorType load(const T *x, StreamingAndAlignedFlag) PURE;
 121             template<typename T> static VectorType load(const T *x, StreamingAndUnalignedFlag) PURE;
 122             template<typename T> static void store(T *mem, const VectorType x, AlignedFlag);
 123             template<typename T> static void store(T *mem, const VectorType x, UnalignedFlag);
 124             template<typename T> static void store(T *mem, const VectorType x, StreamingAndAlignedFlag);
 125             template<typename T> static void store(T *mem, const VectorType x, StreamingAndUnalignedFlag);
 126             template<typename T> static void store(T *mem, const VectorType x, const VectorType m, AlignedFlag);
 127             template<typename T> static void store(T *mem, const VectorType x, const VectorType m, UnalignedFlag);
 128             template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndAlignedFlag);
 129             template<typename T> static void store(T *mem, const VectorType x, const VectorType m, StreamingAndUnalignedFlag);
 130
 131             OP0(allone, _mm_setallone_si128())
 132             OP0(zero, _mm_setzero_si128())
 133             OP2(or_, _mm_or_si128(a, b))
 134             OP2(xor_, _mm_xor_si128(a, b))
 135             OP2(and_, _mm_and_si128(a, b))
 136             OP2(andnot_, _mm_andnot_si128(a, b))
 137             OP3(blend, _mm_blendv_epi8(a, b, c))
 138         };
 139
 140 #undef OP1
 141 #undef OP2
 142 #undef OP3
 143
 144 #define OP1(op) \
 145         static inline VectorType op(const VectorType &a) PURE { return CAT(_mm_##op##_, SUFFIX)(a); }
 146 #define OP(op) \
 147         static inline VectorType op(const VectorType &a, const VectorType &b) PURE { return CAT(_mm_##op##_ , SUFFIX)(a, b); }
 148 #define OP_(op) \
 149         static inline VectorType op(const VectorType &a, const VectorType &b) PURE { return CAT(_mm_##op    , SUFFIX)(a, b); }
 150 #define OPx(op, op2) \
 151         static inline VectorType op(const VectorType &a, const VectorType &b) PURE { return CAT(_mm_##op2##_, SUFFIX)(a, b); }
 152 #define OPcmp(op) \
 153         static inline VectorType cmp##op(const VectorType &a, const VectorType &b) PURE { return CAT(_mm_cmp##op##_, SUFFIX)(a, b); }
 154 #define OP_CAST_(op) \
 155         static inline VectorType op(const VectorType &a, const VectorType &b) PURE { return CAT(_mm_castps_, SUFFIX)( \
 156             _mm_##op##ps(CAT(CAT(_mm_cast, SUFFIX), _ps)(a), \
 157               CAT(CAT(_mm_cast, SUFFIX), _ps)(b))); \
 158         }
 159 #define MINMAX \
 160         static inline VectorType min(VectorType a, VectorType b) PURE { return CAT(_mm_min_, SUFFIX)(a, b); } \
 161         static inline VectorType max(VectorType a, VectorType b) PURE { return CAT(_mm_max_, SUFFIX)(a, b); }
 162
 163         template<> struct VectorHelper<double> {
 164             typedef _M128D VectorType;
 165             typedef double EntryType;
 166 #define SUFFIX pd
 167
 168             OP_(or_) OP_(and_) OP_(xor_)
 169             static inline VectorType notMaskedToZero(VectorType a, _M128 mask) PURE { return CAT(_mm_and_, SUFFIX)(_mm_castps_pd(mask), a); }
 170             static inline VectorType set(const double a) PURE { return CAT(_mm_set1_, SUFFIX)(a); }
 171             static inline VectorType set(const double a, const double b) PURE { return CAT(_mm_set_, SUFFIX)(a, b); }
 172             static inline VectorType zero() PURE { return CAT(_mm_setzero_, SUFFIX)(); }
 173             static inline VectorType one()  PURE { return CAT(_mm_setone_, SUFFIX)(); }// set(1.); }
 174
 175             static inline void multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
 176             static inline VectorType mul(VectorType a, VectorType b, _M128 _mask) PURE {
 177                 _M128D mask = _mm_castps_pd(_mask);
 178                 return _mm_or_pd(
 179                     _mm_and_pd(mask, _mm_mul_pd(a, b)),
 180                     _mm_andnot_pd(mask, a)
 181                     );
 182             }
 183
 184             OP(add) OP(sub) OP(mul)
 185             OPcmp(eq) OPcmp(neq)
 186             OPcmp(lt) OPcmp(nlt)
 187             OPcmp(le) OPcmp(nle)
 188
 189             OP1(sqrt)
 190             static inline VectorType rsqrt(VectorType x) PURE {
 191                 return _mm_div_pd(one(), sqrt(x));
 192             }
 193             static inline VectorType reciprocal(VectorType x) PURE {
 194                 return _mm_div_pd(one(), x);
 195             }
 196             static inline VectorType isNaN(VectorType x) PURE {
 197                 return _mm_cmpunord_pd(x, x);
 198             }
 199             static inline VectorType isFinite(VectorType x) PURE {
 200                 return _mm_cmpord_pd(x, _mm_mul_pd(zero(), x));
 201             }
 202             static inline VectorType abs(const VectorType a) PURE {
 203                 return CAT(_mm_and_, SUFFIX)(a, _mm_setabsmask_pd());
 204             }
 205
 206             MINMAX
 207             static inline EntryType min(VectorType a) PURE {
 208                 a = _mm_min_sd(a, _mm_unpackhi_pd(a, a));
 209                 return _mm_cvtsd_f64(a);
 210             }
 211             static inline EntryType max(VectorType a) PURE {
 212                 a = _mm_max_sd(a, _mm_unpackhi_pd(a, a));
 213                 return _mm_cvtsd_f64(a);
 214             }
 215             static inline EntryType mul(VectorType a) PURE {
 216                 a = _mm_mul_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
 217                 return _mm_cvtsd_f64(a);
 218             }
 219             static inline EntryType add(VectorType a) PURE {
 220                 a = _mm_add_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
 221                 return _mm_cvtsd_f64(a);
 222             }
 223 #undef SUFFIX
 224             static inline VectorType round(VectorType a) PURE {
 225 #if VC_IMPL_SSE4_1
 226                 return _mm_round_pd(a, _MM_FROUND_NINT);
 227 #else
 228                 //XXX: slow: _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
 229                 return _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
 230 #endif
 231             }
 232         };
 233
 234         template<> struct VectorHelper<float> {
 235             typedef float EntryType;
 236             typedef _M128 VectorType;
 237 #define SUFFIX ps
 238
 239             OP_(or_) OP_(and_) OP_(xor_)
 240             static inline VectorType notMaskedToZero(VectorType a, _M128 mask) PURE { return CAT(_mm_and_, SUFFIX)(mask, a); }
 241             static inline VectorType set(const float a) PURE { return CAT(_mm_set1_, SUFFIX)(a); }
 242             static inline VectorType set(const float a, const float b, const float c, const float d) PURE { return CAT(_mm_set_, SUFFIX)(a, b, c, d); }
 243             static inline VectorType zero() PURE { return CAT(_mm_setzero_, SUFFIX)(); }
 244             static inline VectorType one()  PURE { return CAT(_mm_setone_, SUFFIX)(); }// set(1.f); }
 245             static inline _M128 concat(_M128D a, _M128D b) PURE { return _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)); }
 246
 247             static inline void multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
 248             static inline VectorType mul(VectorType a, VectorType b, _M128 mask) PURE {
 249                 return _mm_or_ps(
 250                     _mm_and_ps(mask, _mm_mul_ps(a, b)),
 251                     _mm_andnot_ps(mask, a)
 252                     );
 253             }
 254
 255             OP(add) OP(sub) OP(mul)
 256             OPcmp(eq) OPcmp(neq)
 257             OPcmp(lt) OPcmp(nlt)
 258             OPcmp(le) OPcmp(nle)
 259
 260             OP1(sqrt) OP1(rsqrt)
 261             static inline VectorType isNaN(VectorType x) PURE {
 262                 return _mm_cmpunord_ps(x, x);
 263             }
 264             static inline VectorType isFinite(VectorType x) PURE {
 265                 return _mm_cmpord_ps(x, _mm_mul_ps(zero(), x));
 266             }
 267             static inline VectorType reciprocal(VectorType x) PURE {
 268                 return _mm_rcp_ps(x);
 269             }
 270             static inline VectorType abs(const VectorType a) PURE {
 271                 return CAT(_mm_and_, SUFFIX)(a, _mm_setabsmask_ps());
 272             }
 273
 274             MINMAX
 275             static inline EntryType min(VectorType a) PURE {
 276                 a = _mm_min_ps(a, _mm_movehl_ps(a, a));   // a = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)
 277                 a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); // a = min(a0, a1), a1, a2, a3
 278                 return _mm_cvtss_f32(a);
 279             }
 280             static inline EntryType max(VectorType a) PURE {
 281                 a = _mm_max_ps(a, _mm_movehl_ps(a, a));   // a = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)
 282                 a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); // a = max(a0, a1), a1, a2, a3
 283                 return _mm_cvtss_f32(a);
 284             }
 285             static inline EntryType mul(VectorType a) PURE {
 286                 a = _mm_mul_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
 287                 a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
 288                 return _mm_cvtss_f32(a);
 289             }
 290             static inline EntryType add(VectorType a) PURE {
 291                 a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
 292                 a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
 293                 return _mm_cvtss_f32(a);
 294             }
 295 #undef SUFFIX
 296             static inline VectorType round(VectorType a) PURE {
 297 #if VC_IMPL_SSE4_1
 298                 return _mm_round_ps(a, _MM_FROUND_NINT);
 299 #else
 300                 //XXX slow: _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
 301                 return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
 302 #endif
 303             }
 304         };
 305
 306         template<> struct VectorHelper<float8> {
 307             typedef float EntryType;
 308             typedef M256 VectorType;
 309
 310             static inline VectorType set(const float a) PURE {
 311                 const _M128 x = _mm_set1_ps(a);
 312                 return VectorType::create(x, x);
 313             }
 314             static inline VectorType set(const float a, const float b, const float c, const float d) PURE {
 315                 const _M128 x = _mm_set_ps(a, b, c, d);
 316                 return VectorType::create(x, x);
 317             }
 318             static inline VectorType set(const float a, const float b, const float c, const float d,
 319                     const float e, const float f, const float g, const float h) PURE {
 320                 return VectorType::create(_mm_set_ps(a, b, c, d), _mm_set_ps(e, f, g, h));
 321             }
 322             static inline VectorType zero() PURE { return VectorType::create(_mm_setzero_ps(), _mm_setzero_ps()); }
 323             static inline VectorType one()  PURE { return set(1.f); }
 324
 325 #define REUSE_FLOAT_IMPL1(fun) \
 326             static inline VectorType fun(const VectorType &x) PURE { \
 327                 return VectorType::create(VectorHelper<float>::fun(x[0]), VectorHelper<float>::fun(x[1])); \
 328             }
 329 #define REUSE_FLOAT_IMPL2(fun) \
 330             static inline VectorType fun(const VectorType &x, const VectorType &y) PURE { \
 331                 return VectorType::create(VectorHelper<float>::fun(x[0], y[0]), VectorHelper<float>::fun(x[1], y[1])); \
 332             }
 333 #define REUSE_FLOAT_IMPL3(fun) \
 334             static inline VectorType fun(const VectorType &x, const VectorType &y, const VectorType &z) PURE { \
 335                 return VectorType::create(VectorHelper<float>::fun(x[0], y[0], z[0]), VectorHelper<float>::fun(x[1], y[1], z[1])); \
 336             }
 337             REUSE_FLOAT_IMPL1(reciprocal)
 338             REUSE_FLOAT_IMPL1(sqrt)
 339             REUSE_FLOAT_IMPL1(rsqrt)
 340             REUSE_FLOAT_IMPL1(isNaN)
 341             REUSE_FLOAT_IMPL1(isFinite)
 342             REUSE_FLOAT_IMPL1(abs)
 343             REUSE_FLOAT_IMPL1(round)
 344
 345             REUSE_FLOAT_IMPL2(and_)
 346             REUSE_FLOAT_IMPL2(or_)
 347             REUSE_FLOAT_IMPL2(xor_)
 348             REUSE_FLOAT_IMPL2(notMaskedToZero)
 349             REUSE_FLOAT_IMPL2(add)
 350             REUSE_FLOAT_IMPL2(sub)
 351             REUSE_FLOAT_IMPL2(mul)
 352             REUSE_FLOAT_IMPL2(cmple)
 353             REUSE_FLOAT_IMPL2(cmpnle)
 354             REUSE_FLOAT_IMPL2(cmplt)
 355             REUSE_FLOAT_IMPL2(cmpnlt)
 356             REUSE_FLOAT_IMPL2(cmpeq)
 357             REUSE_FLOAT_IMPL2(cmpneq)
 358             REUSE_FLOAT_IMPL2(min)
 359             REUSE_FLOAT_IMPL2(max)
 360
 361             static inline EntryType min(const VectorType &a) PURE {
 362                 return VectorHelper<float>::min(VectorHelper<float>::min(a[0], a[1]));
 363             }
 364             static inline EntryType max(const VectorType &a) PURE {
 365                 return VectorHelper<float>::max(VectorHelper<float>::max(a[0], a[1]));
 366             }
 367             static inline EntryType mul(const VectorType &a) PURE {
 368                 return VectorHelper<float>::mul(VectorHelper<float>::mul(a[0], a[1]));
 369             }
 370             static inline EntryType add(const VectorType &a) PURE {
 371                 return VectorHelper<float>::add(VectorHelper<float>::add(a[0], a[1]));
 372             }
 373
 374             static inline void multiplyAndAdd(VectorType &a, const VectorType &b, const VectorType &c) {
 375                 VectorHelper<float>::multiplyAndAdd(a[0], b[0], c[0]);
 376                 VectorHelper<float>::multiplyAndAdd(a[1], b[1], c[1]);
 377             }
 378             REUSE_FLOAT_IMPL3(mul)
 379 #undef REUSE_FLOAT_IMPL3
 380 #undef REUSE_FLOAT_IMPL2
 381 #undef REUSE_FLOAT_IMPL1
 382         };
 383
 384         template<> struct VectorHelper<int> {
 385             typedef int EntryType;
 386             typedef _M128I VectorType;
 387 #define SUFFIX si128
 388
 389             OP_(or_) OP_(and_) OP_(xor_)
 390             static inline VectorType zero() PURE { return CAT(_mm_setzero_, SUFFIX)(); }
 391             static inline VectorType notMaskedToZero(VectorType a, _M128 mask) PURE { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); }
 392 #undef SUFFIX
 393 #define SUFFIX epi32
 394             static inline VectorType one() PURE { return CAT(_mm_setone_, SUFFIX)(); }
 395
 396             static inline VectorType set(const int a) PURE { return CAT(_mm_set1_, SUFFIX)(a); }
 397             static inline VectorType set(const int a, const int b, const int c, const int d) PURE { return CAT(_mm_set_, SUFFIX)(a, b, c, d); }
 398
 399             static inline void multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
 400
 401             static inline VectorType shiftLeft(VectorType a, int shift) {
 402                 return CAT(_mm_slli_, SUFFIX)(a, shift);
 403             }
 404             static inline VectorType shiftRight(VectorType a, int shift) {
 405                 return CAT(_mm_srai_, SUFFIX)(a, shift);
 406             }
 407             OP1(abs)
 408
 409             MINMAX
 410             static inline EntryType min(VectorType a) PURE {
 411                 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 412                 // using lo_epi16 for speed here
 413                 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 414                 return _mm_cvtsi128_si32(a);
 415             }
 416             static inline EntryType max(VectorType a) PURE {
 417                 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 418                 // using lo_epi16 for speed here
 419                 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 420                 return _mm_cvtsi128_si32(a);
 421             }
 422             static inline EntryType add(VectorType a) PURE {
 423                 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 424                 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 425                 return _mm_cvtsi128_si32(a);
 426             }
 427 #if VC_IMPL_SSE4_1
 428             static inline VectorType mul(VectorType a, VectorType b) PURE { return _mm_mullo_epi32(a, b); }
 429             static inline EntryType mul(VectorType a) PURE {
 430                 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 431                 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 432                 return _mm_cvtsi128_si32(a);
 433             }
 434 #else
 435             static inline VectorType mul(const VectorType &a, const VectorType &b) PURE {
 436                 const VectorType &aShift = _mm_srli_si128(a, 4);
 437                 const VectorType &ab02 = _mm_mul_epu32(a, b); // [a0 * b0, a2 * b2]
 438                 const VectorType &bShift = _mm_srli_si128(b, 4);
 439                 const VectorType &ab13 = _mm_mul_epu32(aShift, bShift); // [a1 * b1, a3 * b3]
 440                 return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8));
 441             }
 442 #endif
 443             static inline VectorType mul(const VectorType a, const VectorType b, _M128 _mask) PURE {
 444                 return _mm_blendv_epi8(a, mul(a, b), _mm_castps_si128(_mask));
 445             }
 446
 447             OP(add) OP(sub)
 448             OPcmp(eq)
 449             OPcmp(lt)
 450             OPcmp(gt)
 451             static inline VectorType cmpneq(const VectorType &a, const VectorType &b) PURE { _M128I x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
 452             static inline VectorType cmpnlt(const VectorType &a, const VectorType &b) PURE { _M128I x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
 453             static inline VectorType cmple (const VectorType &a, const VectorType &b) PURE { _M128I x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
 454             static inline VectorType cmpnle(const VectorType &a, const VectorType &b) PURE { return cmpgt(a, b); }
 455 #undef SUFFIX
 456             static inline VectorType round(VectorType a) PURE { return a; }
 457         };
 458
 459         template<> struct VectorHelper<unsigned int> {
 460             typedef unsigned int EntryType;
 461             typedef _M128I VectorType;
 462 #define SUFFIX si128
 463             OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_)
 464             static inline VectorType zero() PURE { return CAT(_mm_setzero_, SUFFIX)(); }
 465             static inline VectorType notMaskedToZero(VectorType a, _M128 mask) PURE { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); }
 466
 467 #undef SUFFIX
 468 #define SUFFIX epu32
 469             static inline VectorType one() PURE { return CAT(_mm_setone_, SUFFIX)(); }
 470
 471             MINMAX
 472             static inline EntryType min(VectorType a) PURE {
 473                 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 474                 // using lo_epi16 for speed here
 475                 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 476                 return _mm_cvtsi128_si32(a);
 477             }
 478             static inline EntryType max(VectorType a) PURE {
 479                 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 480                 // using lo_epi16 for speed here
 481                 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 482                 return _mm_cvtsi128_si32(a);
 483             }
 484             static inline EntryType mul(VectorType a) PURE {
 485                 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 486                 // using lo_epi16 for speed here
 487                 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 488                 return _mm_cvtsi128_si32(a);
 489             }
 490             static inline EntryType add(VectorType a) PURE {
 491                 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 492                 // using lo_epi16 for speed here
 493                 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 494                 return _mm_cvtsi128_si32(a);
 495             }
 496
 497             static inline VectorType mul(const VectorType a, const VectorType b, _M128 _mask) PURE {
 498                 return _mm_blendv_epi8(a, mul(a, b), _mm_castps_si128(_mask));
 499             }
 500             static inline VectorType mul(const VectorType &a, const VectorType &b) PURE {
 501                 return VectorHelper<int>::mul(a, b);
 502             }
 503 //X             template<unsigned int b> static inline VectorType mul(const VectorType a) PURE {
 504 //X                 switch (b) {
 505 //X                     case    0: return zero();
 506 //X                     case    1: return a;
 507 //X                     case    2: return _mm_slli_epi32(a,  1);
 508 //X                     case    4: return _mm_slli_epi32(a,  2);
 509 //X                     case    8: return _mm_slli_epi32(a,  3);
 510 //X                     case   16: return _mm_slli_epi32(a,  4);
 511 //X                     case   32: return _mm_slli_epi32(a,  5);
 512 //X                     case   64: return _mm_slli_epi32(a,  6);
 513 //X                     case  128: return _mm_slli_epi32(a,  7);
 514 //X                     case  256: return _mm_slli_epi32(a,  8);
 515 //X                     case  512: return _mm_slli_epi32(a,  9);
 516 //X                     case 1024: return _mm_slli_epi32(a, 10);
 517 //X                     case 2048: return _mm_slli_epi32(a, 11);
 518 //X                 }
 519 //X                 return mul(a, set(b));
 520 //X             }
 521
 522 #undef SUFFIX
 523 #define SUFFIX epi32
 524             static inline VectorType shiftLeft(VectorType a, int shift) {
 525                 return CAT(_mm_slli_, SUFFIX)(a, shift);
 526             }
 527             static inline VectorType shiftRight(VectorType a, int shift) {
 528                 return CAT(_mm_srli_, SUFFIX)(a, shift);
 529             }
 530             static inline VectorType set(const unsigned int a) PURE { return CAT(_mm_set1_, SUFFIX)(a); }
 531             static inline VectorType set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d) PURE { return CAT(_mm_set_, SUFFIX)(a, b, c, d); }
 532
 533             OP(add) OP(sub)
 534             OPcmp(eq)
 535             static inline VectorType cmpneq(const VectorType &a, const VectorType &b) PURE { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); }
 536
 537 #ifndef USE_INCORRECT_UNSIGNED_COMPARE
 538             static inline VectorType cmplt(const VectorType &a, const VectorType &b) PURE {
 539                 return _mm_cmplt_epu32(a, b);
 540             }
 541             static inline VectorType cmpgt(const VectorType &a, const VectorType &b) PURE {
 542                 return _mm_cmpgt_epu32(a, b);
 543             }
 544 #else
 545             OPcmp(lt)
 546             OPcmp(gt)
 547 #endif
 548             static inline VectorType cmpnlt(const VectorType &a, const VectorType &b) PURE { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); }
 549             static inline VectorType cmple (const VectorType &a, const VectorType &b) PURE { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); }
 550             static inline VectorType cmpnle(const VectorType &a, const VectorType &b) PURE { return cmpgt(a, b); }
 551
 552 #undef SUFFIX
 553             static inline VectorType round(VectorType a) PURE { return a; }
 554         };
 555
 556         template<> struct VectorHelper<signed short> {
                 // Static helper operations on vectors of signed short entries that are
                 // held in an _M128I register. Most members forward to a single SSE
                 // intrinsic.
                 //
                 // NOTE(review): OP_, OP, OPx, OPcmp, OP1, CAT and PURE are macros defined
                 // outside this part of the file. From their use here they presumably
                 // generate static wrappers whose intrinsic name is built from the macro
                 // argument and the *current* SUFFIX, so the position of every
                 // #define/#undef SUFFIX below matters -- confirm against the macro
                 // definitions before reordering anything.
 557             typedef _M128I VectorType;
 558             typedef signed short EntryType;
                 // Whole-register suffix: used for the bitwise helpers that follow.
 559 #define SUFFIX si128
 560
 561             OP_(or_) OP_(and_) OP_(xor_)
                 // Returns a register built by the "setzero" intrinsic for the current SUFFIX.
 562             static inline VectorType zero() PURE { return CAT(_mm_setzero_, SUFFIX)(); }
                 // Bitwise AND of a with the float mask reinterpreted as an integer
                 // register: bits of a survive only where the mask has bits set.
 563             static inline VectorType notMaskedToZero(VectorType a, _M128 mask) PURE { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); }
                 // Packs the 32-bit entries of a and b into one register of 16-bit
                 // entries using signed saturation (_mm_packs_epi32).
 564             static inline _M128I concat(_M128I a, _M128I b) PURE { return _mm_packs_epi32(a, b); }
                 // expand0/expand1 widen the low/high four 16-bit entries to 32 bits:
                 // unpack(x, x) places each entry in both halves of a 32-bit lane and the
                 // arithmetic right shift by 16 then sign-extends it.
 565             static inline _M128I expand0(_M128I x) PURE { return _mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16); }
 566             static inline _M128I expand1(_M128I x) PURE { return _mm_srai_epi32(_mm_unpackhi_epi16(x, x), 16); }
 567
 568 #undef SUFFIX
                 // From here on the 16-bit integer suffix is used for intrinsic names.
 569 #define SUFFIX epi16
                 // NOTE(review): with SUFFIX epi16 this names _mm_setone_epi16, which is not
                 // a standard intrinsic -- presumably a project helper that yields 1 in
                 // every entry; confirm.
 570             static inline VectorType one() PURE { return CAT(_mm_setone_, SUFFIX)(); }
 571
                 // Shifts via the "slli" intrinsic for the current SUFFIX (left shift).
 572             static inline VectorType shiftLeft(VectorType a, int shift) {
 573                 return CAT(_mm_slli_, SUFFIX)(a, shift);
 574             }
                 // Shifts via the "srai" intrinsic for the current SUFFIX, i.e. the
                 // arithmetic (sign-preserving) right shift.
 575             static inline VectorType shiftRight(VectorType a, int shift) {
 576                 return CAT(_mm_srai_, SUFFIX)(a, shift);
 577             }
                 // Broadcast form: forwards the single value to the "set1" intrinsic.
 578             static inline VectorType set(const EntryType a) PURE { return CAT(_mm_set1_, SUFFIX)(a); }
                 // Eight-value form: the arguments are forwarded, in the order given, to
                 // the "set" intrinsic (for _mm_set_epi16 the first argument is the
                 // highest entry).
 579             static inline VectorType set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
 580                     const EntryType e, const EntryType f, const EntryType g, const EntryType h) PURE {
 581                 return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
 582             }
 583
                 // In-place v1 = v1 * v2 + v3, composed from the two-argument mul and add.
 584             static inline void multiplyAndAdd(VectorType &v1, VectorType v2, VectorType v3) {
 585                 v1 = add(mul(v1, v2), v3); }
 586
 587             OP1(abs)
 588
                 // Masked multiply: where _mask has bits set the result is mul(a, b),
                 // everywhere else the bits of a are kept unchanged.
 589             static inline VectorType mul(VectorType a, VectorType b, _M128 _mask) PURE {
 590                 _M128I mask = _mm_castps_si128(_mask);
 591                 return _mm_or_si128(
 592                     _mm_and_si128(mask, mul(a, b)),
 593                     _mm_andnot_si128(mask, a)
 594                     );
 595             }
                 // Presumably generates the two-argument mul (via the "mullo" intrinsic)
                 // and the two-argument min/max that the reductions below rely on.
 596             OPx(mul, mullo)
 597             OP(min) OP(max)
                 // Horizontal reductions. All four follow the same three folding steps:
                 //   1. combine with the register whose 64-bit halves are swapped,
                 //   2. combine with the low half whose two 32-bit pairs are swapped,
                 //   3. combine with entry 1 broadcast over the low four entries,
                 // after which entry 0 holds the reduction over all eight entries.
                 // The low 32 bits are then read and converted to EntryType on return,
                 // which keeps only the low 16 bits.
 598             static inline EntryType min(VectorType a) PURE {
 599                 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
 600                 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 601                 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 602                 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 603                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 604             }
 605             static inline EntryType max(VectorType a) PURE {
 606                 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
 607                 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 608                 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 609                 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 610                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 611             }
                 // Horizontal product of all entries, folded with the two-argument mul.
 612             static inline EntryType mul(VectorType a) PURE {
 613                 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 614                 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 615                 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 616                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 617             }
                 // Horizontal sum of all entries, folded with the two-argument add.
 618             static inline EntryType add(VectorType a) PURE {
 619                 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 620                 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 621                 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 622                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 623             }
 624
 625             OP(add) OP(sub)
 626             OPcmp(eq)
 627             OPcmp(lt)
 628             OPcmp(gt)
                 // Negated comparisons: andnot(x, y) computes (~x) & y, so pairing the
                 // positive comparison mask with _mm_setallone_si128() (expected to be
                 // all bits set) yields its bitwise complement.
 629             static inline VectorType cmpneq(const VectorType &a, const VectorType &b) PURE { _M128I x = cmpeq(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
 630             static inline VectorType cmpnlt(const VectorType &a, const VectorType &b) PURE { _M128I x = cmplt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
 631             static inline VectorType cmple (const VectorType &a, const VectorType &b) PURE { _M128I x = cmpgt(a, b); return _mm_andnot_si128(x, _mm_setallone_si128()); }
                 // "Not less-or-equal" is the same mask as "greater than".
 632             static inline VectorType cmpnle(const VectorType &a, const VectorType &b) PURE { return cmpgt(a, b); }
 633 #undef SUFFIX
                 // Rounding is the identity for this entry type: a is returned unchanged.
 634             static inline VectorType round(VectorType a) PURE { return a; }
 635         };
 636
 637         template<> struct VectorHelper<unsigned short> {
 638             typedef _M128I VectorType;
 639             typedef unsigned short EntryType;
 640 #define SUFFIX si128
 641             OP_CAST_(or_) OP_CAST_(and_) OP_CAST_(xor_)
 642             static inline VectorType zero() PURE { return CAT(_mm_setzero_, SUFFIX)(); }
 643             static inline VectorType notMaskedToZero(VectorType a, _M128 mask) PURE { return CAT(_mm_and_, SUFFIX)(_mm_castps_si128(mask), a); }
 644 #if VC_IMPL_SSE4_1
 645             static inline _M128I concat(_M128I a, _M128I b) PURE { return _mm_packus_epi32(a, b); }
 646 #else
 647             // XXX too bad, but this is broken without SSE 4.1
 648             static inline _M128I concat(_M128I a, _M128I b) PURE { return _mm_packs_epi32(a, b); }
 649 #endif
 650             static inline _M128I expand0(_M128I x) PURE { return _mm_srli_epi32(_mm_unpacklo_epi16(x, x), 16); }
 651             static inline _M128I expand1(_M128I x) PURE { return _mm_srli_epi32(_mm_unpackhi_epi16(x, x), 16); }
 652
 653 #undef SUFFIX
 654 #define SUFFIX epu16
 655             static inline VectorType one() PURE { return CAT(_mm_setone_, SUFFIX)(); }
 656
 657             static inline VectorType mul(VectorType a, VectorType b, _M128 _mask) PURE {
 658                 _M128I mask = _mm_castps_si128(_mask);
 659                 return _mm_or_si128(
 660                     _mm_and_si128(mask, mul(a, b)),
 661                     _mm_andnot_si128(mask, a)
 662                     );
 663             }
 664 //X             template<unsigned int b> static inline VectorType mul(const VectorType a) PURE {
 665 //X                 switch (b) {
 666 //X                     case    0: return zero();
 667 //X                     case    1: return a;
 668 //X                     case    2: return _mm_slli_epi16(a,  1);
 669 //X                     case    4: return _mm_slli_epi16(a,  2);
 670 //X                     case    8: return _mm_slli_epi16(a,  3);
 671 //X                     case   16: return _mm_slli_epi16(a,  4);
 672 //X                     case   32: return _mm_slli_epi16(a,  5);
 673 //X                     case   64: return _mm_slli_epi16(a,  6);
 674 //X                     case  128: return _mm_slli_epi16(a,  7);
 675 //X                     case  256: return _mm_slli_epi16(a,  8);
 676 //X                     case  512: return _mm_slli_epi16(a,  9);
 677 //X                     case 1024: return _mm_slli_epi16(a, 10);
 678 //X                     case 2048: return _mm_slli_epi16(a, 11);
 679 //X                 }
 680 //X                 return mul(a, set(b));
 681 //X             }
 682 #if !defined(USE_INCORRECT_UNSIGNED_COMPARE) || VC_IMPL_SSE4_1
 683             OP(min) OP(max)
 684 #endif
 685 #undef SUFFIX
 686 #define SUFFIX epi16
 687             static inline VectorType shiftLeft(VectorType a, int shift) {
 688                 return CAT(_mm_slli_, SUFFIX)(a, shift);
 689             }
 690             static inline VectorType shiftRight(VectorType a, int shift) {
 691                 return CAT(_mm_srli_, SUFFIX)(a, shift);
 692             }
 693             OPx(mul, mullo) // should work correctly for all values
 694 #if defined(USE_INCORRECT_UNSIGNED_COMPARE) && !defined(VC_IMPL_SSE4_1)
 695             OP(min) OP(max) // XXX breaks for values with MSB set
 696 #endif
 697             static inline EntryType min(VectorType a) PURE {
 698                 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
 699                 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 700                 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 701                 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 702                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 703             }
 704             static inline EntryType max(VectorType a) PURE {
 705                 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
 706                 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 707                 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 708                 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 709                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 710             }
 711             static inline EntryType mul(VectorType a) PURE {
 712                 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
 713                 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 714                 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 715                 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 716                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 717             }
 718             static inline EntryType add(VectorType a) PURE {
 719                 // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
 720                 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
 721                 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
 722                 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
 723                 return _mm_cvtsi128_si32(a); // & 0xffff is implicit
 724             }
 725             static inline VectorType set(const EntryType a) PURE { return CAT(_mm_set1_, SUFFIX)(a); }
 726             static inline VectorType set(const EntryType a, const EntryType b, const EntryType c,
 727                     const EntryType d, const EntryType e, const EntryType f,
 728                     const EntryType g, const EntryType h) PURE {
 729                 return CAT(_mm_set_, SUFFIX)(a, b, c, d, e, f, g, h);
 730             }
 731
 732             OP(add) OP(sub)
 733             OPcmp(eq)
 734             static inline VectorType cmpneq(const VectorType &a, const VectorType &b) PURE { return _mm_andnot_si128(cmpeq(a, b), _mm_setallone_si128()); }
 735
 736 #ifndef USE_INCORRECT_UNSIGNED_COMPARE
 737             static inline VectorType cmplt(const VectorType &a, const VectorType &b) PURE {
 738                 return _mm_cmplt_epu16(a, b);
 739             }
 740             static inline VectorType cmpgt(const VectorType &a, const VectorType &b) PURE {
 741                 return _mm_cmpgt_epu16(a, b);
 742             }
 743 #else
 744             OPcmp(lt)
 745             OPcmp(gt)
 746 #endif
 747             static inline VectorType cmpnlt(const VectorType &a, const VectorType &b) PURE { return _mm_andnot_si128(cmplt(a, b), _mm_setallone_si128()); }
 748             static inline VectorType cmple (const VectorType &a, const VectorType &b) PURE { return _mm_andnot_si128(cmpgt(a, b), _mm_setallone_si128()); }
 749             static inline VectorType cmpnle(const VectorType &a, const VectorType &b) PURE { return cmpgt(a, b); }
 750 #undef SUFFIX
 751             static inline VectorType round(VectorType a) PURE { return a; }
 752         };
 753 #undef OP1
 754 #undef OP
 755 #undef OP_
 756 #undef OPx
 757 #undef OPcmp
 758
 759 } // namespace SSE
 760 } // namespace Vc
 761
 762 #include "vectorhelper.tcc"
 763
 764 #endif // SSE_VECTORHELPER_H