update to Vc 0.7.3-dev

[u/mrichter/AliRoot.git] / Vc / include / Vc / avx / shuffle.h
diff --git a/Vc/include/Vc/avx/shuffle.h b/Vc/include/Vc/avx/shuffle.h
index b7e82873a7493b18b3b13bccbf86044dc350ced9..9bc56b5d7532cc9303d0aed9aae06742417dd7b1 100644
--- a/Vc/include/Vc/avx/shuffle.h
+++ b/Vc/include/Vc/avx/shuffle.h
@@ -21,51 +21,80 @@
  #define VC_AVX_SHUFFLE_H
  
  #include "../sse/shuffle.h"
+#include "macros.h"
  
+namespace AliRoot {
  namespace Vc
  {
+        using AVX::m128;
+        using AVX::m128d;
+        using AVX::m128i;
+        using AVX::m256;
+        using AVX::m256d;
+        using AVX::m256i;
+        using AVX::param128;
+        using AVX::param128d;
+        using AVX::param128i;
+        using AVX::param256;
+        using AVX::param256d;
+        using AVX::param256i;
      namespace Mem
      {
-        template<VecPos L, VecPos H> static inline __m256 ALWAYS_INLINE CONST shuffle128(__m256 x, __m256 y) {
+        template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256 Vc_CONST permute128(param256 x) {
+            VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range);
+            VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range);
+            return _mm256_permute2f128_ps(x, x, L + H * (1 << 4));
+        }
+        template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256d Vc_CONST permute128(param256d x) {
+            VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range);
+            VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range);
+            return _mm256_permute2f128_pd(x, x, L + H * (1 << 4));
+        }
+        template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256i Vc_CONST permute128(param256i x) {
+            VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range);
+            VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range);
+            return _mm256_permute2f128_si256(x, x, L + H * (1 << 4));
+        }
+        template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle128(param256 x, param256 y) {
              VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
              VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
              return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
          }
-        template<VecPos L, VecPos H> static inline __m256i ALWAYS_INLINE CONST shuffle128(__m256i x, __m256i y) {
+        template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256i Vc_CONST shuffle128(param256i x, param256i y) {
              VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
              VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
              return _mm256_permute2f128_si256(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
          }
-        template<VecPos L, VecPos H> static inline __m256d ALWAYS_INLINE CONST shuffle128(__m256d x, __m256d y) {
+        template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle128(param256d x, param256d y) {
              VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
              VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
              return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
          }
-        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static inline __m256d ALWAYS_INLINE CONST permute(__m256d x) {
+        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256d Vc_CONST permute(param256d x) {
              VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, Incorrect_Range);
              VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
              return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
          }
-        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static inline __m256 ALWAYS_INLINE CONST permute(__m256 x) {
+        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) {
              VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
              VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
              return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
          }
-        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static inline __m256i ALWAYS_INLINE CONST permute(__m256i x) {
+        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256i Vc_CONST permute(param256i x) {
              return _mm256_castps_si256(permute<Dst0, Dst1, Dst2, Dst3>(_mm256_castsi256_ps(x)));
          }
-        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static inline __m256d ALWAYS_INLINE CONST shuffle(__m256d x, __m256d y) {
+        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle(param256d x, param256d y) {
              VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, Incorrect_Range);
              VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, Incorrect_Range);
              return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
          }
-        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static inline __m256 ALWAYS_INLINE CONST shuffle(__m256 x, __m256 y) {
+        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle(param256 x, param256 y) {
              VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range);
              VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range);
              return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
          }
          template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
-        static inline __m256 ALWAYS_INLINE CONST blend(__m256 x, __m256 y) {
+        static Vc_ALWAYS_INLINE m256 Vc_CONST blend(param256 x, param256 y) {
              VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range);
              VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range);
              VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range);
@@ -82,12 +111,12 @@ namespace Vc
                      );
          }
          template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
-        static inline __m256i ALWAYS_INLINE CONST blend(__m256i x, __m256i y) {
+        static Vc_ALWAYS_INLINE m256i Vc_CONST blend(param256i x, param256i y) {
              return _mm256_castps_si256(blend<Dst0, Dst1, Dst2, Dst3, Dst4, Dst5, Dst6, Dst7>(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
          }
          template<VecPos Dst> struct ScaleForBlend { enum { Value = Dst >= X4 ? Dst - X4 + Y0 : Dst }; };
          template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
-        static inline __m256 ALWAYS_INLINE CONST permute(__m256 x) {
+        static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) {
              VC_STATIC_ASSERT(Dst0 >= X0 && Dst0 <= X7, Incorrect_Range);
              VC_STATIC_ASSERT(Dst1 >= X0 && Dst1 <= X7, Incorrect_Range);
              VC_STATIC_ASSERT(Dst2 >= X0 && Dst2 <= X7, Incorrect_Range);
@@ -99,9 +128,9 @@ namespace Vc
              if (Dst0 + X4 == Dst4 && Dst1 + X4 == Dst5 && Dst2 + X4 == Dst6 && Dst3 + X4 == Dst7) {
                  return permute<Dst0, Dst1, Dst2, Dst3>(x);
              }
-            const __m128 loIn = _mm256_castps256_ps128(x);
-            const __m128 hiIn = _mm256_extractf128_ps(x, 1);
-            __m128 lo, hi;
+            const m128 loIn = _mm256_castps256_ps128(x);
+            const m128 hiIn = _mm256_extractf128_ps(x, 1);
+            m128 lo, hi;
  
              if (Dst0 < X4 && Dst1 < X4 && Dst2 < X4 && Dst3 < X4) {
                  lo = _mm_permute_ps(loIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
@@ -157,52 +186,54 @@ namespace Vc
      // The shuffles and permutes above use memory ordering. The ones below use register ordering:
      namespace Reg
      {
-        template<VecPos H, VecPos L> static inline __m256 ALWAYS_INLINE CONST permute128(__m256 x, __m256 y) {
+        template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE m256 Vc_CONST permute128(param256 x, param256 y) {
              VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
              VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
              return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
          }
-        template<VecPos H, VecPos L> static inline __m256i ALWAYS_INLINE CONST permute128(__m256i x, __m256i y) {
+        template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE m256i Vc_CONST permute128(param256i x, param256i y) {
              VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
              VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
              return _mm256_permute2f128_si256(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
          }
-        template<VecPos H, VecPos L> static inline __m256d ALWAYS_INLINE CONST permute128(__m256d x, __m256d y) {
+        template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE m256d Vc_CONST permute128(param256d x, param256d y) {
              VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
              VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
              return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
          }
-        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static inline __m256d ALWAYS_INLINE CONST permute(__m256d x) {
+        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m256d Vc_CONST permute(param256d x) {
              VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, Incorrect_Range);
              VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
              return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
          }
-        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static inline __m256 ALWAYS_INLINE CONST permute(__m256 x) {
+        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) {
              VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
              VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
              return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
          }
-        template<VecPos Dst1, VecPos Dst0> static inline __m128d ALWAYS_INLINE CONST permute(__m128d x) {
+        template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m128d Vc_CONST permute(param128d x) {
              VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0, Incorrect_Range);
              VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1, Incorrect_Range);
              return _mm_permute_pd(x, Dst0 + Dst1 * 2);
          }
-        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static inline __m128 ALWAYS_INLINE CONST permute(__m128 x) {
+        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m128 Vc_CONST permute(param128 x) {
              VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
              VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
              return _mm_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
          }
-        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static inline __m256d ALWAYS_INLINE CONST shuffle(__m256d x, __m256d y) {
+        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle(param256d x, param256d y) {
              VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, Incorrect_Range);
              VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, Incorrect_Range);
              return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
          }
-        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static inline __m256 ALWAYS_INLINE CONST shuffle(__m256 x, __m256 y) {
+        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle(param256 x, param256 y) {
              VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range);
              VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range);
              return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
          }
      } // namespace Reg
  } // namespace Vc
+} // namespace AliRoot
+#include "undomacros.h"
  
  #endif // VC_AVX_SHUFFLE_H