/*  This file is part of the Vc library.

    Copyright (C) 2011-2012 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc. If not, see <http://www.gnu.org/licenses/>.

*/
#ifndef VC_SSE_SHUFFLE_H
#define VC_SSE_SHUFFLE_H

namespace Vc
{
    // Element positions in memory order: X0..X7 index elements of the first
    // operand, Y0..Y7 the corresponding elements of the second. X0 == 0 and
    // Y0 == 8, so (Dst / Y0) is 1 exactly when Dst names a Y position.
    enum VecPos {
        X0, X1, X2, X3, X4, X5, X6, X7,
        Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7
    };

    namespace Mem
    {
        // shuffle<X1, X2, Y0, Y2>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x1 x2 y0 y2]
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static inline __m128 ALWAYS_INLINE CONST shuffle(__m128 x, __m128 y) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range);
            return _mm_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
        }
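
        // Illustrative note (not in the original header): the immediate encodes
        // two bits per destination element, exactly the _MM_SHUFFLE scheme.
        // Assuming x = {1, 2, 3, 4} and y = {5, 6, 7, 8} in memory order,
        //     __m128 r = Mem::shuffle<X1, X2, Y0, Y2>(x, y);
        // uses immediate 1 + 2*4 + 0*16 + 2*64 == 0x89 and yields r = {2, 3, 5, 7}.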

        // shuffle<X1, Y0>([x0 x1], [y0 y1]) = [x1 y0]
        template<VecPos Dst0, VecPos Dst1> static inline __m128d ALWAYS_INLINE CONST shuffle(__m128d x, __m128d y) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1, Incorrect_Range);
            return _mm_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2);
        }

        // blend<X0, Y1>([x0 x1], [y0 y1]) = [x0 y1]
        template<VecPos Dst0, VecPos Dst1> static inline __m128d ALWAYS_INLINE CONST blend(__m128d x, __m128d y) {
            VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range);
#if !defined(VC_IMPL_SSE4_1) && !defined(VC_IMPL_AVX)
            // without SSE4.1, use Vc's emulation of the _mm_blend_pd intrinsic
            using Vc::SSE::_mm_blend_pd;
#endif
            return _mm_blend_pd(x, y, (Dst0 / Y0) + (Dst1 / Y0) * 2);
        }
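
        // Illustrative note (not in the original header): each Dst is either an X
        // position (value < 8) or a Y position (value >= 8), so the integer
        // division by Y0 == 8 produces the per-element blend-mask bit: 0 keeps
        // the element of x, 1 takes the element of y. Mem::blend<X0, Y1>(x, y)
        // thus passes mask 0 + 1*2 == 2 and yields [x0 y1].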

        // blend<X0, Y1, Y2, X3>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x0 y1 y2 x3]
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static inline __m128 ALWAYS_INLINE CONST blend(__m128 x, __m128 y) {
            VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range);
            VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range);
            VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range);
#if !defined(VC_IMPL_SSE4_1) && !defined(VC_IMPL_AVX)
            // without SSE4.1, use Vc's emulation of the _mm_blend_ps intrinsic
            using Vc::SSE::_mm_blend_ps;
#endif
            return _mm_blend_ps(x, y,
                    (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
                    (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8);
        }

        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
        static inline __m128i ALWAYS_INLINE CONST blend(__m128i x, __m128i y) {
            VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range);
            VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range);
            VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range);
            VC_STATIC_ASSERT(Dst4 == X4 || Dst4 == Y4, Incorrect_Range);
            VC_STATIC_ASSERT(Dst5 == X5 || Dst5 == Y5, Incorrect_Range);
            VC_STATIC_ASSERT(Dst6 == X6 || Dst6 == Y6, Incorrect_Range);
            VC_STATIC_ASSERT(Dst7 == X7 || Dst7 == Y7, Incorrect_Range);
#if !defined(VC_IMPL_SSE4_1) && !defined(VC_IMPL_AVX)
            // without SSE4.1, use Vc's emulation of the _mm_blend_epi16 intrinsic
            using Vc::SSE::_mm_blend_epi16;
#endif
            return _mm_blend_epi16(x, y,
                    (Dst0 / Y0) *  1 + (Dst1 / Y1) *  2 +
                    (Dst2 / Y2) *  4 + (Dst3 / Y3) *  8 +
                    (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 +
                    (Dst6 / Y6) * 64 + (Dst7 / Y7) * 128);
        }
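
        // Illustrative note (not in the original header): e.g.
        //     __m128i r = Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(x, y);
        // builds mask 0b10101010 and replaces every odd 16-bit word with the
        // corresponding word of y.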

        // permute<X1, X2, X0, X2>([x0 x1 x2 x3]) = [x1 x2 x0 x2]
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static inline __m128 ALWAYS_INLINE CONST permute(__m128 x) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
            return _mm_shuffle_ps(x, x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }
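
        // Illustrative note (not in the original header):
        //     Mem::permute<X2, X3, X0, X1>(x)   // swaps the two 64-bit halves of x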

        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static inline __m128i ALWAYS_INLINE CONST permute(__m128i x) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
            return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }

        // permutes the four low 16-bit words; the high 64 bits pass through unchanged
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static inline __m128i ALWAYS_INLINE CONST permuteLo(__m128i x) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
            return _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }

        // permutes the four high 16-bit words (positions are given as X4..X7 and
        // rebased by subtracting X4); the low 64 bits pass through unchanged
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static inline __m128i ALWAYS_INLINE CONST permuteHi(__m128i x) {
            VC_STATIC_ASSERT(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, Incorrect_Range);
            return _mm_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64);
        }
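
        // Illustrative note (not in the original header):
        //     Mem::permuteHi<X7, X6, X5, X4>(x)   // reverses the high four words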

        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
        static inline __m128i ALWAYS_INLINE CONST permute(__m128i x) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
            VC_STATIC_ASSERT(Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4, Incorrect_Range);
            VC_STATIC_ASSERT(Dst4 <= X7 && Dst5 <= X7 && Dst6 <= X7 && Dst7 <= X7, Incorrect_Range);
            // both conditions are compile-time constants, so only the shuffles
            // that actually change something are emitted
            if (Dst0 != X0 || Dst1 != X1 || Dst2 != X2 || Dst3 != X3) {
                x = _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
            }
            if (Dst4 != X4 || Dst5 != X5 || Dst6 != X6 || Dst7 != X7) {
                x = _mm_shufflehi_epi16(x, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
            }
            return x;
        }
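
        // Illustrative note (not in the original header): words cannot cross the
        // 64-bit halves here; e.g.
        //     Mem::permute<X3, X2, X1, X0, X7, X6, X5, X4>(x)
        // reverses the words within each half and compiles to one pshuflw plus
        // one pshufhw.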
    } // namespace Mem

    // The shuffles and permutes above use memory ordering. The ones below use
    // register ordering:
    namespace Reg
    {
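        // Illustrative note (not in the original header): register ordering lists
        // elements from the most significant down, as in the Intel manuals. The
        // Reg:: functions simply reverse their template arguments and forward to
        // the Mem:: versions, so the following two calls are equivalent:
        //     Mem::shuffle<X1, X2, Y0, Y2>(x, y)
        //     Reg::shuffle<Y2, Y0, X2, X1>(x, y)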

        // shuffle<Y2, Y0, X2, X1>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1]
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static inline __m128 ALWAYS_INLINE CONST shuffle(__m128 x, __m128 y) {
            return Mem::shuffle<Dst0, Dst1, Dst2, Dst3>(x, y);
        }

        // shuffle<Y0, X1>([x1 x0], [y1 y0]) = [y0 x1]
        template<VecPos Dst1, VecPos Dst0> static inline __m128d ALWAYS_INLINE CONST shuffle(__m128d x, __m128d y) {
            return Mem::shuffle<Dst0, Dst1>(x, y);
        }

        // permute<X3, X0, X2, X1>([x3 x2 x1 x0]) = [x3 x0 x2 x1]
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static inline __m128i ALWAYS_INLINE CONST permute(__m128i x) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
            return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }
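
        // Illustrative note (not in the original header):
        //     Reg::permute<X0, X1, X2, X3>(x)   // reverses the four elements of x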

        // shuffle<Y2, Y0, X2, X1>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1]
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static inline __m128i ALWAYS_INLINE CONST shuffle(__m128i x, __m128i y) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range);
            // the casts are cost-free reinterpretations that make the integer
            // vectors acceptable to _mm_shuffle_ps
            return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x), _mm_castsi128_ps(y), Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64));
        }

        // blend<Y1, X0>([x1 x0], [y1 y0]) = [y1 x0]
        template<VecPos Dst1, VecPos Dst0> static inline __m128d ALWAYS_INLINE CONST blend(__m128d x, __m128d y) {
            return Mem::blend<Dst0, Dst1>(x, y);
        }

        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static inline __m128 ALWAYS_INLINE CONST blend(__m128 x, __m128 y) {
            return Mem::blend<Dst0, Dst1, Dst2, Dst3>(x, y);
        }
    } // namespace Reg
} // namespace Vc

#endif // VC_SSE_SHUFFLE_H