Vc/include/Vc/avx/shuffle.h

   1 /*  This file is part of the Vc library.
   2
   3     Copyright (C) 2011-2012 Matthias Kretz <kretz@kde.org>
   4
   5     Vc is free software: you can redistribute it and/or modify
   6     it under the terms of the GNU Lesser General Public License as
   7     published by the Free Software Foundation, either version 3 of
   8     the License, or (at your option) any later version.
   9
  10     Vc is distributed in the hope that it will be useful, but
  11     WITHOUT ANY WARRANTY; without even the implied warranty of
  12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13     GNU Lesser General Public License for more details.
  14
  15     You should have received a copy of the GNU Lesser General Public
  16     License along with Vc.  If not, see <http://www.gnu.org/licenses/>.
  17
  18 */
  19
  20 #ifndef VC_AVX_SHUFFLE_H
  21 #define VC_AVX_SHUFFLE_H
  22
  23 #include "../sse/shuffle.h"
  24 #include "macros.h"
  25
  26 namespace AliRoot {
  27 namespace Vc
  28 {
  29         using AVX::m128;
  30         using AVX::m128d;
  31         using AVX::m128i;
  32         using AVX::m256;
  33         using AVX::m256d;
  34         using AVX::m256i;
  35         using AVX::param128;
  36         using AVX::param128d;
  37         using AVX::param128i;
  38         using AVX::param256;
  39         using AVX::param256d;
  40         using AVX::param256i;
  41     namespace Mem
  42     {
  43         template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256 Vc_CONST permute128(param256 x) {
  44             VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range);
  45             VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range);
  46             return _mm256_permute2f128_ps(x, x, L + H * (1 << 4));
  47         }
  48         template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256d Vc_CONST permute128(param256d x) {
  49             VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range);
  50             VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range);
  51             return _mm256_permute2f128_pd(x, x, L + H * (1 << 4));
  52         }
  53         template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256i Vc_CONST permute128(param256i x) {
  54             VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range);
  55             VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range);
  56             return _mm256_permute2f128_si256(x, x, L + H * (1 << 4));
  57         }
  58         template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle128(param256 x, param256 y) {
  59             VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
  60             VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
  61             return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
  62         }
  63         template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256i Vc_CONST shuffle128(param256i x, param256i y) {
  64             VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
  65             VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
  66             return _mm256_permute2f128_si256(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
  67         }
  68         template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle128(param256d x, param256d y) {
  69             VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
  70             VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
  71             return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
  72         }
  73         template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256d Vc_CONST permute(param256d x) {
  74             VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, Incorrect_Range);
  75             VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
  76             return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
  77         }
  78         template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) {
  79             VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
  80             VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
  81             return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
  82         }
  83         template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256i Vc_CONST permute(param256i x) {
  84             return _mm256_castps_si256(permute<Dst0, Dst1, Dst2, Dst3>(_mm256_castsi256_ps(x)));
  85         }
  86         template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle(param256d x, param256d y) {
  87             VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, Incorrect_Range);
  88             VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, Incorrect_Range);
  89             return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
  90         }
  91         template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle(param256 x, param256 y) {
  92             VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range);
  93             VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range);
  94             return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
  95         }
  96         template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
  97         static Vc_ALWAYS_INLINE m256 Vc_CONST blend(param256 x, param256 y) {
  98             VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range);
  99             VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range);
 100             VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range);
 101             VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range);
 102             VC_STATIC_ASSERT(Dst4 == X4 || Dst4 == Y4, Incorrect_Range);
 103             VC_STATIC_ASSERT(Dst5 == X5 || Dst5 == Y5, Incorrect_Range);
 104             VC_STATIC_ASSERT(Dst6 == X6 || Dst6 == Y6, Incorrect_Range);
 105             VC_STATIC_ASSERT(Dst7 == X7 || Dst7 == Y7, Incorrect_Range);
 106             return _mm256_blend_ps(x, y,
 107                     (Dst0 / Y0) *  1 + (Dst1 / Y1) *  2 +
 108                     (Dst2 / Y2) *  4 + (Dst3 / Y3) *  8 +
 109                     (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 +
 110                     (Dst6 / Y6) * 64 + (Dst7 / Y7) *128
 111                     );
 112         }
 113         template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
 114         static Vc_ALWAYS_INLINE m256i Vc_CONST blend(param256i x, param256i y) {
 115             return _mm256_castps_si256(blend<Dst0, Dst1, Dst2, Dst3, Dst4, Dst5, Dst6, Dst7>(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
 116         }
 117         template<VecPos Dst> struct ScaleForBlend { enum { Value = Dst >= X4 ? Dst - X4 + Y0 : Dst }; };
 118         template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
 119         static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) {
 120             VC_STATIC_ASSERT(Dst0 >= X0 && Dst0 <= X7, Incorrect_Range);
 121             VC_STATIC_ASSERT(Dst1 >= X0 && Dst1 <= X7, Incorrect_Range);
 122             VC_STATIC_ASSERT(Dst2 >= X0 && Dst2 <= X7, Incorrect_Range);
 123             VC_STATIC_ASSERT(Dst3 >= X0 && Dst3 <= X7, Incorrect_Range);
 124             VC_STATIC_ASSERT(Dst4 >= X0 && Dst4 <= X7, Incorrect_Range);
 125             VC_STATIC_ASSERT(Dst5 >= X0 && Dst5 <= X7, Incorrect_Range);
 126             VC_STATIC_ASSERT(Dst6 >= X0 && Dst6 <= X7, Incorrect_Range);
 127             VC_STATIC_ASSERT(Dst7 >= X0 && Dst7 <= X7, Incorrect_Range);
 128             if (Dst0 + X4 == Dst4 && Dst1 + X4 == Dst5 && Dst2 + X4 == Dst6 && Dst3 + X4 == Dst7) {
 129                 return permute<Dst0, Dst1, Dst2, Dst3>(x);
 130             }
 131             const m128 loIn = _mm256_castps256_ps128(x);
 132             const m128 hiIn = _mm256_extractf128_ps(x, 1);
 133             m128 lo, hi;
 134
 135             if (Dst0 < X4 && Dst1 < X4 && Dst2 < X4 && Dst3 < X4) {
 136                 lo = _mm_permute_ps(loIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
 137             } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4) {
 138                 lo = _mm_permute_ps(hiIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
 139             } else if (Dst0 < X4 && Dst1 < X4 && Dst2 >= X4 && Dst3 >= X4) {
 140                 lo = shuffle<Dst0, Dst1, Dst2 - X4 + Y0, Dst3 - X4 + Y0>(loIn, hiIn);
 141             } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 < X4 && Dst3 < X4) {
 142                 lo = shuffle<Dst0 - X4, Dst1 - X4, Dst2 + Y0, Dst3 + Y0>(hiIn, loIn);
 143             } else if (Dst0 == X0 && Dst1 == X4 && Dst2 == X1 && Dst3 == X5) {
 144                 lo = _mm_unpacklo_ps(loIn, hiIn);
 145             } else if (Dst0 == X4 && Dst1 == X0 && Dst2 == X5 && Dst3 == X1) {
 146                 lo = _mm_unpacklo_ps(hiIn, loIn);
 147             } else if (Dst0 == X2 && Dst1 == X6 && Dst2 == X3 && Dst3 == X7) {
 148                 lo = _mm_unpackhi_ps(loIn, hiIn);
 149             } else if (Dst0 == X6 && Dst1 == X2 && Dst2 == X7 && Dst3 == X3) {
 150                 lo = _mm_unpackhi_ps(hiIn, loIn);
 151             } else if (Dst0 % X4 == 0 && Dst1 % X4 == 1 && Dst2 % X4 == 2 && Dst3 % X4 == 3) {
 152                 lo = blend<ScaleForBlend<Dst0>::Value, ScaleForBlend<Dst1>::Value,
 153                    ScaleForBlend<Dst2>::Value, ScaleForBlend<Dst3>::Value>(loIn, hiIn);
 154             }
 155
 156             if (Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4) {
 157                 hi = _mm_permute_ps(hiIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
 158             } else if (Dst4 < X4 && Dst5 < X4 && Dst6 < X4 && Dst7 < X4) {
 159                 hi = _mm_permute_ps(loIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
 160             } else if (Dst4 < X4 && Dst5 < X4 && Dst6 >= X4 && Dst7 >= X4) {
 161                 hi = shuffle<Dst4, Dst5, Dst6 - X4 + Y0, Dst7 - X4 + Y0>(loIn, hiIn);
 162             } else if (Dst4 >= X4 && Dst5 >= X4 && Dst6 < X4 && Dst7 < X4) {
 163                 hi = shuffle<Dst4 - X4, Dst5 - X4, Dst6 + Y0, Dst7 + Y0>(hiIn, loIn);
 164             } else if (Dst4 == X0 && Dst5 == X4 && Dst6 == X1 && Dst7 == X5) {
 165                 hi = _mm_unpacklo_ps(loIn, hiIn);
 166             } else if (Dst4 == X4 && Dst5 == X0 && Dst6 == X5 && Dst7 == X1) {
 167                 hi = _mm_unpacklo_ps(hiIn, loIn);
 168             } else if (Dst4 == X2 && Dst5 == X6 && Dst6 == X3 && Dst7 == X7) {
 169                 hi = _mm_unpackhi_ps(loIn, hiIn);
 170             } else if (Dst4 == X6 && Dst5 == X2 && Dst6 == X7 && Dst7 == X3) {
 171                 hi = _mm_unpackhi_ps(hiIn, loIn);
 172             } else if (Dst4 % X4 == 0 && Dst5 % X4 == 1 && Dst6 % X4 == 2 && Dst7 % X4 == 3) {
 173                 hi = blend<ScaleForBlend<Dst4>::Value, ScaleForBlend<Dst5>::Value,
 174                    ScaleForBlend<Dst6>::Value, ScaleForBlend<Dst7>::Value>(loIn, hiIn);
 175             }
 176
 177             return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
 178         }
 179     } // namespace Mem
 180
 181     // little endian has the lo bits on the right and high bits on the left
 182     // with vectors this becomes greatly confusing:
 183     // Mem: abcd
 184     // Reg: dcba
 185     //
 186     // The shuffles and permutes above use memory ordering. The ones below use register ordering:
 187     namespace Reg
 188     {
 189         template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE m256 Vc_CONST permute128(param256 x, param256 y) {
 190             VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
 191             VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
 192             return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
 193         }
 194         template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE m256i Vc_CONST permute128(param256i x, param256i y) {
 195             VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
 196             VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
 197             return _mm256_permute2f128_si256(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
 198         }
 199         template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE m256d Vc_CONST permute128(param256d x, param256d y) {
 200             VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
 201             VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
 202             return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
 203         }
 204         template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m256d Vc_CONST permute(param256d x) {
 205             VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, Incorrect_Range);
 206             VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
 207             return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
 208         }
 209         template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) {
 210             VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
 211             VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
 212             return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
 213         }
 214         template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m128d Vc_CONST permute(param128d x) {
 215             VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0, Incorrect_Range);
 216             VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1, Incorrect_Range);
 217             return _mm_permute_pd(x, Dst0 + Dst1 * 2);
 218         }
 219         template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m128 Vc_CONST permute(param128 x) {
 220             VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
 221             VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
 222             return _mm_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
 223         }
 224         template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle(param256d x, param256d y) {
 225             VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, Incorrect_Range);
 226             VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, Incorrect_Range);
 227             return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
 228         }
 229         template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle(param256 x, param256 y) {
 230             VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range);
 231             VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range);
 232             return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
 233         }
 234     } // namespace Reg
 235 } // namespace Vc
 236 } // namespace AliRoot
 237 #include "undomacros.h"
 238
 239 #endif // VC_AVX_SHUFFLE_H