1 /* This file is part of the Vc library.
3 Copyright (C) 2011-2012 Matthias Kretz <kretz@kde.org>
5 Vc is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation, either version 3 of
8 the License, or (at your option) any later version.
10 Vc is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with Vc. If not, see <http://www.gnu.org/licenses/>.
20 #ifndef VC_AVX_SHUFFLE_H
21 #define VC_AVX_SHUFFLE_H
23 #include "../sse/shuffle.h"
43 template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256 Vc_CONST permute128(param256 x) {
44 VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range);
45 VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range);
46 return _mm256_permute2f128_ps(x, x, L + H * (1 << 4));
48 template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256d Vc_CONST permute128(param256d x) {
49 VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range);
50 VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range);
51 return _mm256_permute2f128_pd(x, x, L + H * (1 << 4));
53 template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256i Vc_CONST permute128(param256i x) {
54 VC_STATIC_ASSERT(L >= X0 && L <= X1, Incorrect_Range);
55 VC_STATIC_ASSERT(H >= X0 && H <= X1, Incorrect_Range);
56 return _mm256_permute2f128_si256(x, x, L + H * (1 << 4));
58 template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle128(param256 x, param256 y) {
59 VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
60 VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
61 return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
63 template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256i Vc_CONST shuffle128(param256i x, param256i y) {
64 VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
65 VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
66 return _mm256_permute2f128_si256(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
68 template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle128(param256d x, param256d y) {
69 VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
70 VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
71 return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
73 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256d Vc_CONST permute(param256d x) {
74 VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, Incorrect_Range);
75 VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
76 return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
78 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) {
79 VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
80 VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
81 return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
83 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256i Vc_CONST permute(param256i x) {
84 return _mm256_castps_si256(permute<Dst0, Dst1, Dst2, Dst3>(_mm256_castsi256_ps(x)));
86 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle(param256d x, param256d y) {
87 VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, Incorrect_Range);
88 VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, Incorrect_Range);
89 return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
91 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle(param256 x, param256 y) {
92 VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range);
93 VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range);
94 return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
96 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
97 static Vc_ALWAYS_INLINE m256 Vc_CONST blend(param256 x, param256 y) {
98 VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range);
99 VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range);
100 VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range);
101 VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range);
102 VC_STATIC_ASSERT(Dst4 == X4 || Dst4 == Y4, Incorrect_Range);
103 VC_STATIC_ASSERT(Dst5 == X5 || Dst5 == Y5, Incorrect_Range);
104 VC_STATIC_ASSERT(Dst6 == X6 || Dst6 == Y6, Incorrect_Range);
105 VC_STATIC_ASSERT(Dst7 == X7 || Dst7 == Y7, Incorrect_Range);
106 return _mm256_blend_ps(x, y,
107 (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
108 (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 +
109 (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 +
110 (Dst6 / Y6) * 64 + (Dst7 / Y7) *128
113 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
114 static Vc_ALWAYS_INLINE m256i Vc_CONST blend(param256i x, param256i y) {
115 return _mm256_castps_si256(blend<Dst0, Dst1, Dst2, Dst3, Dst4, Dst5, Dst6, Dst7>(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
117 template<VecPos Dst> struct ScaleForBlend { enum { Value = Dst >= X4 ? Dst - X4 + Y0 : Dst }; };
118 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
119 static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) {
120 VC_STATIC_ASSERT(Dst0 >= X0 && Dst0 <= X7, Incorrect_Range);
121 VC_STATIC_ASSERT(Dst1 >= X0 && Dst1 <= X7, Incorrect_Range);
122 VC_STATIC_ASSERT(Dst2 >= X0 && Dst2 <= X7, Incorrect_Range);
123 VC_STATIC_ASSERT(Dst3 >= X0 && Dst3 <= X7, Incorrect_Range);
124 VC_STATIC_ASSERT(Dst4 >= X0 && Dst4 <= X7, Incorrect_Range);
125 VC_STATIC_ASSERT(Dst5 >= X0 && Dst5 <= X7, Incorrect_Range);
126 VC_STATIC_ASSERT(Dst6 >= X0 && Dst6 <= X7, Incorrect_Range);
127 VC_STATIC_ASSERT(Dst7 >= X0 && Dst7 <= X7, Incorrect_Range);
128 if (Dst0 + X4 == Dst4 && Dst1 + X4 == Dst5 && Dst2 + X4 == Dst6 && Dst3 + X4 == Dst7) {
129 return permute<Dst0, Dst1, Dst2, Dst3>(x);
131 const m128 loIn = _mm256_castps256_ps128(x);
132 const m128 hiIn = _mm256_extractf128_ps(x, 1);
135 if (Dst0 < X4 && Dst1 < X4 && Dst2 < X4 && Dst3 < X4) {
136 lo = _mm_permute_ps(loIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
137 } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4) {
138 lo = _mm_permute_ps(hiIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
139 } else if (Dst0 < X4 && Dst1 < X4 && Dst2 >= X4 && Dst3 >= X4) {
140 lo = shuffle<Dst0, Dst1, Dst2 - X4 + Y0, Dst3 - X4 + Y0>(loIn, hiIn);
141 } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 < X4 && Dst3 < X4) {
142 lo = shuffle<Dst0 - X4, Dst1 - X4, Dst2 + Y0, Dst3 + Y0>(hiIn, loIn);
143 } else if (Dst0 == X0 && Dst1 == X4 && Dst2 == X1 && Dst3 == X5) {
144 lo = _mm_unpacklo_ps(loIn, hiIn);
145 } else if (Dst0 == X4 && Dst1 == X0 && Dst2 == X5 && Dst3 == X1) {
146 lo = _mm_unpacklo_ps(hiIn, loIn);
147 } else if (Dst0 == X2 && Dst1 == X6 && Dst2 == X3 && Dst3 == X7) {
148 lo = _mm_unpackhi_ps(loIn, hiIn);
149 } else if (Dst0 == X6 && Dst1 == X2 && Dst2 == X7 && Dst3 == X3) {
150 lo = _mm_unpackhi_ps(hiIn, loIn);
151 } else if (Dst0 % X4 == 0 && Dst1 % X4 == 1 && Dst2 % X4 == 2 && Dst3 % X4 == 3) {
152 lo = blend<ScaleForBlend<Dst0>::Value, ScaleForBlend<Dst1>::Value,
153 ScaleForBlend<Dst2>::Value, ScaleForBlend<Dst3>::Value>(loIn, hiIn);
156 if (Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4) {
157 hi = _mm_permute_ps(hiIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
158 } else if (Dst4 < X4 && Dst5 < X4 && Dst6 < X4 && Dst7 < X4) {
159 hi = _mm_permute_ps(loIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
160 } else if (Dst4 < X4 && Dst5 < X4 && Dst6 >= X4 && Dst7 >= X4) {
161 hi = shuffle<Dst4, Dst5, Dst6 - X4 + Y0, Dst7 - X4 + Y0>(loIn, hiIn);
162 } else if (Dst4 >= X4 && Dst5 >= X4 && Dst6 < X4 && Dst7 < X4) {
163 hi = shuffle<Dst4 - X4, Dst5 - X4, Dst6 + Y0, Dst7 + Y0>(hiIn, loIn);
164 } else if (Dst4 == X0 && Dst5 == X4 && Dst6 == X1 && Dst7 == X5) {
165 hi = _mm_unpacklo_ps(loIn, hiIn);
166 } else if (Dst4 == X4 && Dst5 == X0 && Dst6 == X5 && Dst7 == X1) {
167 hi = _mm_unpacklo_ps(hiIn, loIn);
168 } else if (Dst4 == X2 && Dst5 == X6 && Dst6 == X3 && Dst7 == X7) {
169 hi = _mm_unpackhi_ps(loIn, hiIn);
170 } else if (Dst4 == X6 && Dst5 == X2 && Dst6 == X7 && Dst7 == X3) {
171 hi = _mm_unpackhi_ps(hiIn, loIn);
172 } else if (Dst4 % X4 == 0 && Dst5 % X4 == 1 && Dst6 % X4 == 2 && Dst7 % X4 == 3) {
173 hi = blend<ScaleForBlend<Dst4>::Value, ScaleForBlend<Dst5>::Value,
174 ScaleForBlend<Dst6>::Value, ScaleForBlend<Dst7>::Value>(loIn, hiIn);
177 return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
181 // little endian has the lo bits on the right and high bits on the left
182 // with vectors this becomes greatly confusing:
186 // The shuffles and permutes above use memory ordering. The ones below use register ordering:
189 template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE m256 Vc_CONST permute128(param256 x, param256 y) {
190 VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
191 VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
192 return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
194 template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE m256i Vc_CONST permute128(param256i x, param256i y) {
195 VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
196 VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
197 return _mm256_permute2f128_si256(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
199 template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE m256d Vc_CONST permute128(param256d x, param256d y) {
200 VC_STATIC_ASSERT(L >= X0 && H >= X0, Incorrect_Range);
201 VC_STATIC_ASSERT(L <= Y1 && H <= Y1, Incorrect_Range);
202 return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
204 template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m256d Vc_CONST permute(param256d x) {
205 VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, Incorrect_Range);
206 VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
207 return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
209 template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m256 Vc_CONST permute(param256 x) {
210 VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
211 VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
212 return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
214 template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m128d Vc_CONST permute(param128d x) {
215 VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0, Incorrect_Range);
216 VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= X1, Incorrect_Range);
217 return _mm_permute_pd(x, Dst0 + Dst1 * 2);
219 template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m128 Vc_CONST permute(param128 x) {
220 VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
221 VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
222 return _mm_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
224 template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m256d Vc_CONST shuffle(param256d x, param256d y) {
225 VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, Incorrect_Range);
226 VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, Incorrect_Range);
227 return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
229 template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE m256 Vc_CONST shuffle(param256 x, param256 y) {
230 VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range);
231 VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range);
232 return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
236 } // namespace AliRoot
237 #include "undomacros.h"
239 #endif // VC_AVX_SHUFFLE_H