/* This file is part of the Vc library.

    Copyright (C) 2011-2012 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc. If not, see <http://www.gnu.org/licenses/>.

*/

#ifndef VC_SSE_SHUFFLE_H
#define VC_SSE_SHUFFLE_H

#include "macros.h"

namespace AliRoot {
namespace Vc
{
    enum VecPos {
        X0, X1, X2, X3, X4, X5, X6, X7,
        Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7
    };
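    // X0..X7 address elements of the first argument, Y0..Y7 the corresponding elements of the
    // second one. The index arithmetic in the functions below relies on this enum layout,
    // i.e. on Y0 == X0 + 8.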

    namespace Mem
    {
        // shuffle<X1, X2, Y0, Y2>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x1 x2 y0 y2]
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range);
            return _mm_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
        }

        // shuffle<X1, Y0>([x0 x1], [y0 y1]) = [x1 y0]
        template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1, Incorrect_Range);
            return _mm_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2);
        }

        // blend<X0, Y1>([x0 x1], [y0 y1]) = [x0 y1]
        template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
            VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range);
#if !defined(VC_IMPL_SSE4_1) && !defined(VC_IMPL_AVX)
            using Vc::SSE::_mm_blend_pd;
#endif
            return _mm_blend_pd(x, y, (Dst0 / Y0) + (Dst1 / Y0) * 2);
        }

        // blend<X0, Y1, Y2, X3>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x0 y1 y2 x3]
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
            VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range);
            VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range);
            VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range);
#if !defined(VC_IMPL_SSE4_1) && !defined(VC_IMPL_AVX)
            using Vc::SSE::_mm_blend_ps;
#endif
            return _mm_blend_ps(x, y,
                    (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
                    (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8);
        }

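        // blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>([x0 ... x7], [y0 ... y7]) = [x0 y1 x2 y3 x4 y5 x6 y7]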
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
        static Vc_ALWAYS_INLINE __m128i Vc_CONST blend(__m128i x, __m128i y) {
            VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range);
            VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range);
            VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range);
            VC_STATIC_ASSERT(Dst4 == X4 || Dst4 == Y4, Incorrect_Range);
            VC_STATIC_ASSERT(Dst5 == X5 || Dst5 == Y5, Incorrect_Range);
            VC_STATIC_ASSERT(Dst6 == X6 || Dst6 == Y6, Incorrect_Range);
            VC_STATIC_ASSERT(Dst7 == X7 || Dst7 == Y7, Incorrect_Range);
#if !defined(VC_IMPL_SSE4_1) && !defined(VC_IMPL_AVX)
            using Vc::SSE::_mm_blend_epi16;
#endif
            return _mm_blend_epi16(x, y,
                    (Dst0 / Y0) *  1 + (Dst1 / Y1) *  2 +
                    (Dst2 / Y2) *  4 + (Dst3 / Y3) *  8 +
                    (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 +
                    (Dst6 / Y6) * 64 + (Dst7 / Y7) * 128
                    );
        }

        // permute<X1, X2, X0, X3>([x0 x1 x2 x3]) = [x1 x2 x0 x3]
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
            return _mm_shuffle_ps(x, x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }

        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
            return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }

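        // permuteLo<X1, X0, X3, X2>([x0 ... x7]) = [x1 x0 x3 x2 x4 x5 x6 x7] (high four words unchanged)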
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteLo(__m128i x) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
            return _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }

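        // permuteHi<X5, X4, X7, X6>([x0 ... x7]) = [x0 x1 x2 x3 x5 x4 x7 x6] (low four words unchanged)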
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteHi(__m128i x) {
            VC_STATIC_ASSERT(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, Incorrect_Range);
            return _mm_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64);
        }

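        // permute<X1, X0, X3, X2, X5, X4, X7, X6>([x0 ... x7]) = [x1 x0 x3 x2 x5 x4 x7 x6]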
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
        static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
            VC_STATIC_ASSERT(Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4, Incorrect_Range);
            VC_STATIC_ASSERT(Dst4 <= X7 && Dst5 <= X7 && Dst6 <= X7 && Dst7 <= X7, Incorrect_Range);
            if (Dst0 != X0 || Dst1 != X1 || Dst2 != X2 || Dst3 != X3) {
                x = _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
            }
            if (Dst4 != X4 || Dst5 != X5 || Dst6 != X6 || Dst7 != X7) {
                x = _mm_shufflehi_epi16(x, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
            }
            return x;
        }
    } // namespace Mem
    // The shuffles and permutes above use memory ordering. The ones below use register ordering:
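    // in register ordering the comments list elements from the highest position down, and the
    // template arguments are passed from Dst3 (or Dst1) down to Dst0 and forwarded to the Mem::
    // variants in reverse order. For example, Reg::shuffle<Y2, Y0, X2, X1> selects the same
    // elements as Mem::shuffle<X1, X2, Y0, Y2>.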
    namespace Reg
    {
        // shuffle<Y2, Y0, X2, X1>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1]
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
            return Mem::shuffle<Dst0, Dst1, Dst2, Dst3>(x, y);
        }

        // shuffle<Y0, X1>([x1 x0], [y1 y0]) = [y0 x1]
        template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
            return Mem::shuffle<Dst0, Dst1>(x, y);
        }

        // permute<X3, X0, X2, X1>([x3 x2 x1 x0]) = [x3 x0 x2 x1]
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
            return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }

        // shuffle<Y2, Y0, X2, X1>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1]
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST shuffle(__m128i x, __m128i y) {
            VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range);
            VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range);
            return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x), _mm_castsi128_ps(y), Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64));
        }

        // blend<Y1, X0>([x1 x0], [y1 y0]) = [y1 x0]
        template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
            return Mem::blend<Dst0, Dst1>(x, y);
        }

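        // blend<Y3, X2, Y1, X0>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y3 x2 y1 x0]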
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
            return Mem::blend<Dst0, Dst1, Dst2, Dst3>(x, y);
        }
    } // namespace Reg
} // namespace Vc
} // namespace AliRoot

#include "undomacros.h"

#endif // VC_SSE_SHUFFLE_H