]>
Commit | Line | Data |
---|---|---|
f22341db | 1 | /* This file is part of the Vc library. |
2 | ||
3 | Copyright (C) 2011-2012 Matthias Kretz <kretz@kde.org> | |
4 | ||
5 | Vc is free software: you can redistribute it and/or modify | |
6 | it under the terms of the GNU Lesser General Public License as | |
7 | published by the Free Software Foundation, either version 3 of | |
8 | the License, or (at your option) any later version. | |
9 | ||
10 | Vc is distributed in the hope that it will be useful, but | |
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | GNU Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with Vc. If not, see <http://www.gnu.org/licenses/>. | |
17 | ||
18 | */ | |
19 | ||
20 | #ifndef VC_SSE_SHUFFLE_H | |
21 | #define VC_SSE_SHUFFLE_H | |
22 | ||
c017a39f | 23 | #include "macros.h" |
24 | ||
namespace AliRoot {
namespace Vc
{
// Lane selectors for the shuffle/blend/permute templates below.
// X0..X7 name lanes of the first operand, Y0..Y7 lanes of the second.
// Y0 has the value 8, so (Dst - Y0) recovers a plain lane index and
// (Dst / Y0) is 1 exactly when Dst selects from the second operand.
enum VecPos {
    X0, X1, X2, X3, X4, X5, X6, X7,
    Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7
};

namespace Mem
{
35 | // shuffle<X1, X2, Y0, Y2>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x1 x2 y0 y2] | |
c017a39f | 36 | template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) { |
f22341db | 37 | VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range); |
38 | VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range); | |
39 | return _mm_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64); | |
40 | } | |
41 | ||
42 | // shuffle<X1, Y0>([x0 x1], [y0 y1]) = [x1 y0] | |
c017a39f | 43 | template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) { |
f22341db | 44 | VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0, Incorrect_Range); |
45 | VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1, Incorrect_Range); | |
46 | return _mm_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2); | |
47 | } | |
48 | ||
49 | // blend<X0, Y1>([x0 x1], [y0, y1]) = [x0 y1] | |
c017a39f | 50 | template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) { |
f22341db | 51 | VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range); |
52 | VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range); | |
53 | #if !defined(VC_IMPL_SSE4_1) && !defined(VC_IMPL_AVX) | |
54 | using Vc::SSE::_mm_blend_pd; | |
55 | #endif | |
56 | return _mm_blend_pd(x, y, (Dst0 / Y0) + (Dst1 / Y0) * 2); | |
57 | } | |
58 | ||
// blend<X0, Y1, Y2, X3>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x0 y1 y2 x3]
// Per-lane selection between x and y; each DstN must name lane N of either
// operand (values never move to a different lane position).
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
    VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range);
    VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range);
    VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range);
    VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range);
#if !defined(VC_IMPL_SSE4_1) && !defined(VC_IMPL_AVX)
    // BLENDPS needs SSE4.1; pull in Vc's drop-in emulation otherwise
    using Vc::SSE::_mm_blend_ps;
#endif
    // mask bit N is 1 exactly when DstN is a Y selector (DstN / YN == 1 then)
    return _mm_blend_ps(x, y,
            (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
            (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8);
}
72 | ||
// Blend for eight 16-bit words: result word N comes from x when DstN == XN
// and from y when DstN == YN; words never move to a different position.
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m128i Vc_CONST blend(__m128i x, __m128i y) {
    VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range);
    VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range);
    VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range);
    VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range);
    VC_STATIC_ASSERT(Dst4 == X4 || Dst4 == Y4, Incorrect_Range);
    VC_STATIC_ASSERT(Dst5 == X5 || Dst5 == Y5, Incorrect_Range);
    VC_STATIC_ASSERT(Dst6 == X6 || Dst6 == Y6, Incorrect_Range);
    VC_STATIC_ASSERT(Dst7 == X7 || Dst7 == Y7, Incorrect_Range);
#if !defined(VC_IMPL_SSE4_1) && !defined(VC_IMPL_AVX)
    // PBLENDW needs SSE4.1; pull in Vc's drop-in emulation otherwise
    using Vc::SSE::_mm_blend_epi16;
#endif
    // mask bit N is 1 exactly when DstN is a Y selector (DstN / YN == 1 then)
    return _mm_blend_epi16(x, y,
            (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
            (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 +
            (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 +
            (Dst6 / Y6) * 64 + (Dst7 / Y7) *128
            );
}
93 | ||
// permute<X1, X2, X0, X2>([x0 x1 x2 x3]) = [x1 x2 x0 x2]
// Single-operand lane reordering; all selectors must be X0..X3.
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
    VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
    VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
    // SHUFPS with x for both sources; two immediate bits per result lane
    return _mm_shuffle_ps(x, x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
100 | ||
c017a39f | 101 | template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) { |
f22341db | 102 | VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); |
103 | VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); | |
104 | return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); | |
105 | } | |
106 | ||
c017a39f | 107 | template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteLo(__m128i x) { |
f22341db | 108 | VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); |
109 | VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); | |
110 | return _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); | |
111 | } | |
112 | ||
c017a39f | 113 | template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteHi(__m128i x) { |
f22341db | 114 | VC_STATIC_ASSERT(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, Incorrect_Range); |
115 | VC_STATIC_ASSERT(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, Incorrect_Range); | |
116 | return _mm_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64); | |
117 | } | |
118 | ||
// Permute all eight 16-bit words of x. Dst0..Dst3 must stay within the low
// half (X0..X3) and Dst4..Dst7 within the high half (X4..X7), because the
// implementation uses PSHUFLW/PSHUFHW, which cannot cross the 64-bit halves.
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
    VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
    VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
    VC_STATIC_ASSERT(Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4, Incorrect_Range);
    VC_STATIC_ASSERT(Dst4 <= X7 && Dst5 <= X7 && Dst6 <= X7 && Dst7 <= X7, Incorrect_Range);
    // each half's shuffle is skipped when that half is requested unchanged;
    // the conditions are compile-time constants, so no runtime branch remains
    if (Dst0 != X0 || Dst1 != X1 || Dst2 != X2 || Dst3 != X3) {
        x = _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
    }
    if (Dst4 != X4 || Dst5 != X5 || Dst6 != X6 || Dst7 != X7) {
        x = _mm_shufflehi_epi16(x, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
    }
    return x;
}
133 | } // namespace Mem | |
// The shuffles and permutes above use memory ordering. The ones below use register ordering:
namespace Reg
{
// shuffle<Y2, Y0, X2, X1>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1]
// Register-order wrapper: template arguments are given most-significant lane
// first and forwarded in reversed order to the memory-order Mem::shuffle.
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
    return Mem::shuffle<Dst0, Dst1, Dst2, Dst3>(x, y);
}
141 | ||
// shuffle<Y0, X1>([x1 x0], [y1 y0]) = [y0 x1]
// Register-order wrapper: arguments reversed and forwarded to Mem::shuffle.
template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
    return Mem::shuffle<Dst0, Dst1>(x, y);
}
146 | ||
147 | // shuffle<X3, X0, X2, X1>([x3 x2 x1 x0]) = [x3 x0 x2 x1] | |
c017a39f | 148 | template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) { |
f22341db | 149 | VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); |
150 | VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); | |
151 | return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); | |
152 | } | |
153 | ||
154 | // shuffle<Y2, Y0, X2, X1>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1] | |
c017a39f | 155 | template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST shuffle(__m128i x, __m128i y) { |
f22341db | 156 | VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range); |
157 | VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range); | |
158 | return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x), _mm_castsi128_ps(y), Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64)); | |
159 | } | |
160 | ||
// blend<Y1, X0>([x1 x0], [y1, y0]) = [x1 y0]
// Register-order wrapper: arguments reversed and forwarded to Mem::blend.
template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
    return Mem::blend<Dst0, Dst1>(x, y);
}
165 | ||
// Register-order wrapper around the four-lane Mem::blend (lane order reversed).
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
    return Mem::blend<Dst0, Dst1, Dst2, Dst3>(x, y);
}
169 | } // namespace Reg | |
170 | } // namespace Vc | |
c017a39f | 171 | } // namespace AliRoot |
172 | ||
173 | #include "undomacros.h" | |
f22341db | 174 | |
175 | #endif // VC_SSE_SHUFFLE_H |