1 /* This file is part of the Vc library. {{{
3 Copyright (C) 2012 Matthias Kretz <kretz@kde.org>
5 Vc is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation, either version 3 of
8 the License, or (at your option) any later version.
10 Vc is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with Vc. If not, see <http://www.gnu.org/licenses/>.
20 #ifndef VC_SSE_INTERLEAVEDMEMORY_TCC
21 #define VC_SSE_INTERLEAVEDMEMORY_TCC
33 template<typename V, int Size> struct InterleaveImpl;
34 template<> struct InterleaveImpl<SSE::sfloat_v, 8> {
35 static inline void interleave(float *const data, const SSE::sfloat_v::IndexType &i,/*{{{*/
36 const SSE::sfloat_v::AsArg v0, const SSE::sfloat_v::AsArg v1)
38 const __m128 tmp0 = _mm_unpacklo_ps(v0.data()[0], v1.data()[0]);
39 const __m128 tmp1 = _mm_unpackhi_ps(v0.data()[0], v1.data()[0]);
40 const __m128 tmp2 = _mm_unpacklo_ps(v0.data()[1], v1.data()[1]);
41 const __m128 tmp3 = _mm_unpackhi_ps(v0.data()[1], v1.data()[1]);
43 _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0);
44 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0);
45 _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1);
46 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1);
47 _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[4]]), tmp2);
48 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), tmp2);
49 _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[6]]), tmp3);
50 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), tmp3);
52 static inline void interleave(float *const data, const SSE::sfloat_v::IndexType &i,/*{{{*/
53 const SSE::sfloat_v::AsArg v0, const SSE::sfloat_v::AsArg v1, const SSE::sfloat_v::AsArg v2)
55 #ifdef VC_USE_MASKMOV_SCATTER
56 const __m128i mask = _mm_set_epi32(0, -1, -1, -1);
58 const __m128 tmp0 = _mm_unpacklo_ps(v0.data()[0], v1.data()[0]);
59 const __m128 tmp1 = _mm_unpackhi_ps(v0.data()[0], v1.data()[0]);
60 const __m128 tmp2 = _mm_unpacklo_ps(v2.data()[0], v2.data()[0]);
61 const __m128 tmp3 = _mm_unpackhi_ps(v2.data()[0], v2.data()[0]);
62 _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp0, tmp2)), mask, reinterpret_cast<char *>(&data[i[0]]));
63 _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp2, tmp0)), mask, reinterpret_cast<char *>(&data[i[1]]));
64 _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp1, tmp3)), mask, reinterpret_cast<char *>(&data[i[2]]));
65 _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp3, tmp1)), mask, reinterpret_cast<char *>(&data[i[3]]));
67 const __m128 tmp8 = _mm_unpacklo_ps(v0.data()[1], v1.data()[1]);
68 const __m128 tmp9 = _mm_unpackhi_ps(v0.data()[1], v1.data()[1]);
69 const __m128 tmp10 = _mm_unpacklo_ps(v2.data()[1], v2.data()[1]);
70 const __m128 tmp11 = _mm_unpackhi_ps(v2.data()[1], v2.data()[1]);
71 _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp8, tmp10)), mask, reinterpret_cast<char *>(&data[i[4]]));
72 _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp10, tmp8)), mask, reinterpret_cast<char *>(&data[i[5]]));
73 _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp9, tmp11)), mask, reinterpret_cast<char *>(&data[i[6]]));
74 _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp11, tmp9)), mask, reinterpret_cast<char *>(&data[i[7]]));
76 interleave(data, i, v0, v1);
77 v2.scatter(data + 2, i);
80 static inline void interleave(float *const data, const SSE::sfloat_v::IndexType &i,/*{{{*/
81 const SSE::sfloat_v::AsArg v0, const SSE::sfloat_v::AsArg v1,
82 const SSE::sfloat_v::AsArg v2, const SSE::sfloat_v::AsArg v3)
84 const __m128 tmp0 = _mm_unpacklo_ps(v0.data()[0], v1.data()[0]);
85 const __m128 tmp1 = _mm_unpackhi_ps(v0.data()[0], v1.data()[0]);
86 const __m128 tmp2 = _mm_unpacklo_ps(v2.data()[0], v3.data()[0]);
87 const __m128 tmp3 = _mm_unpackhi_ps(v2.data()[0], v3.data()[0]);
88 _mm_storeu_ps(&data[i[0]], _mm_movelh_ps(tmp0, tmp2));
89 _mm_storeu_ps(&data[i[1]], _mm_movehl_ps(tmp2, tmp0));
90 _mm_storeu_ps(&data[i[2]], _mm_movelh_ps(tmp1, tmp3));
91 _mm_storeu_ps(&data[i[3]], _mm_movehl_ps(tmp3, tmp1));
93 const __m128 tmp8 = _mm_unpacklo_ps(v0.data()[1], v1.data()[1]);
94 const __m128 tmp9 = _mm_unpackhi_ps(v0.data()[1], v1.data()[1]);
95 const __m128 tmp10 = _mm_unpacklo_ps(v2.data()[1], v3.data()[1]);
96 const __m128 tmp11 = _mm_unpackhi_ps(v2.data()[1], v3.data()[1]);
97 _mm_storeu_ps(&data[i[4]], _mm_movelh_ps(tmp8, tmp10));
98 _mm_storeu_ps(&data[i[5]], _mm_movehl_ps(tmp10, tmp8));
99 _mm_storeu_ps(&data[i[6]], _mm_movelh_ps(tmp9, tmp11));
100 _mm_storeu_ps(&data[i[7]], _mm_movehl_ps(tmp11, tmp9));
103 template<typename V> struct InterleaveImpl<V, 8> {
104 static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/
105 const typename V::AsArg v0, const typename V::AsArg v1)
107 const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data());
108 const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data());
110 const long long tmp00 = _mm_cvtsi128_si64(tmp0);
111 const long long tmp01 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp0, tmp0));
112 const long long tmp10 = _mm_cvtsi128_si64(tmp1);
113 const long long tmp11 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp1, tmp1));
114 *reinterpret_cast<int *>(&data[i[0]]) = tmp00;
115 *reinterpret_cast<int *>(&data[i[1]]) = tmp00 >> 32;
116 *reinterpret_cast<int *>(&data[i[2]]) = tmp01;
117 *reinterpret_cast<int *>(&data[i[3]]) = tmp01 >> 32;
118 *reinterpret_cast<int *>(&data[i[4]]) = tmp10;
119 *reinterpret_cast<int *>(&data[i[5]]) = tmp10 >> 32;
120 *reinterpret_cast<int *>(&data[i[6]]) = tmp11;
121 *reinterpret_cast<int *>(&data[i[7]]) = tmp11 >> 32;
122 #elif defined(VC_IMPL_SSE4_1)
123 *reinterpret_cast<int *>(&data[i[0]]) = _mm_cvtsi128_si32(tmp0);
124 *reinterpret_cast<int *>(&data[i[1]]) = _mm_extract_epi32(tmp0, 1);
125 *reinterpret_cast<int *>(&data[i[2]]) = _mm_extract_epi32(tmp0, 2);
126 *reinterpret_cast<int *>(&data[i[3]]) = _mm_extract_epi32(tmp0, 3);
127 *reinterpret_cast<int *>(&data[i[4]]) = _mm_cvtsi128_si32(tmp1);
128 *reinterpret_cast<int *>(&data[i[5]]) = _mm_extract_epi32(tmp1, 1);
129 *reinterpret_cast<int *>(&data[i[6]]) = _mm_extract_epi32(tmp1, 2);
130 *reinterpret_cast<int *>(&data[i[7]]) = _mm_extract_epi32(tmp1, 3);
132 *reinterpret_cast<int *>(&data[i[0]]) = _mm_cvtsi128_si32(tmp0);
133 *reinterpret_cast<int *>(&data[i[1]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 4));
134 *reinterpret_cast<int *>(&data[i[2]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 8));
135 *reinterpret_cast<int *>(&data[i[3]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 12));
136 *reinterpret_cast<int *>(&data[i[4]]) = _mm_cvtsi128_si32(tmp1);
137 *reinterpret_cast<int *>(&data[i[5]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 4));
138 *reinterpret_cast<int *>(&data[i[6]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 8));
139 *reinterpret_cast<int *>(&data[i[7]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 12));
142 static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/
143 const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
145 #ifdef VC_USE_MASKMOV_SCATTER
146 const __m128i maskLo = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
147 const __m128i maskHi = _mm_set_epi16(0, -1, -1, -1, 0, 0, 0, 0);
148 typename V::EntryType *const dataHi = data - 4;
149 const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
150 const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
151 const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v1.data());
152 const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v1.data());
154 const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
155 const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
156 const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
157 const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);
158 _mm_maskmoveu_si128(tmp4, maskLo, reinterpret_cast<char *>(&data[i[0]]));
159 _mm_maskmoveu_si128(tmp4, maskHi, reinterpret_cast<char *>(&dataHi[i[1]]));
160 _mm_maskmoveu_si128(tmp5, maskLo, reinterpret_cast<char *>(&data[i[2]]));
161 _mm_maskmoveu_si128(tmp5, maskHi, reinterpret_cast<char *>(&dataHi[i[3]]));
162 _mm_maskmoveu_si128(tmp6, maskLo, reinterpret_cast<char *>(&data[i[4]]));
163 _mm_maskmoveu_si128(tmp6, maskHi, reinterpret_cast<char *>(&dataHi[i[5]]));
164 _mm_maskmoveu_si128(tmp7, maskLo, reinterpret_cast<char *>(&data[i[6]]));
165 _mm_maskmoveu_si128(tmp7, maskHi, reinterpret_cast<char *>(&dataHi[i[7]]));
167 interleave(data, i, v0, v1);
168 v2.scatter(data + 2, i);
171 static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/
172 const typename V::AsArg v0, const typename V::AsArg v1,
173 const typename V::AsArg v2, const typename V::AsArg v3)
175 const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
176 const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
177 const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data());
178 const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data());
180 const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
181 const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
182 const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
183 const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);
185 _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[0]]), tmp4);
186 _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[2]]), tmp5);
187 _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[4]]), tmp6);
188 _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[6]]), tmp7);
189 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), _mm_castsi128_ps(tmp4));
190 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), _mm_castsi128_ps(tmp5));
191 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), _mm_castsi128_ps(tmp6));
192 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), _mm_castsi128_ps(tmp7));
195 template<typename V> struct InterleaveImpl<V, 4> {
196 static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/
197 const typename V::AsArg v0, const typename V::AsArg v1)
199 const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
200 const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
201 _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0);
202 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0);
203 _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1);
204 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1);
206 static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/
207 const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
209 #ifdef VC_USE_MASKMOV_SCATTER
210 const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
211 const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
212 const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data()));
213 const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data()));
214 const __m128i mask = _mm_set_epi32(0, -1, -1, -1);
215 _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp0, tmp2)), mask, reinterpret_cast<char *>(&data[i[0]]));
216 _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp2, tmp0)), mask, reinterpret_cast<char *>(&data[i[1]]));
217 _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp1, tmp3)), mask, reinterpret_cast<char *>(&data[i[2]]));
218 _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp3, tmp1)), mask, reinterpret_cast<char *>(&data[i[3]]));
220 const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
221 const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
222 _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0);
223 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0);
224 _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1);
225 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1);
226 v2.scatter(data + 2, i);
229 static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/
230 const typename V::AsArg v0, const typename V::AsArg v1,
231 const typename V::AsArg v2, const typename V::AsArg v3)
233 const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
234 const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
235 const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data()));
236 const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data()));
237 _mm_storeu_ps(reinterpret_cast<float *>(&data[i[0]]), _mm_movelh_ps(tmp0, tmp2));
238 _mm_storeu_ps(reinterpret_cast<float *>(&data[i[1]]), _mm_movehl_ps(tmp2, tmp0));
239 _mm_storeu_ps(reinterpret_cast<float *>(&data[i[2]]), _mm_movelh_ps(tmp1, tmp3));
240 _mm_storeu_ps(reinterpret_cast<float *>(&data[i[3]]), _mm_movehl_ps(tmp3, tmp1));
243 template<typename V> struct InterleaveImpl<V, 2> {
244 static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/
245 const typename V::AsArg v0, const typename V::AsArg v1)
247 const __m128d tmp0 = _mm_unpacklo_pd(v0.data(), v1.data());
248 const __m128d tmp1 = _mm_unpackhi_pd(v0.data(), v1.data());
249 _mm_storeu_pd(&data[i[0]], tmp0);
250 _mm_storeu_pd(&data[i[1]], tmp1);
252 static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/
253 const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
255 interleave(data, i, v0, v1);
256 v2.scatter(data + 2, i);
258 static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/
259 const typename V::AsArg v0, const typename V::AsArg v1,
260 const typename V::AsArg v2, const typename V::AsArg v3)
262 interleave(data, i, v0, v1);
263 interleave(data + 2, i, v2, v3);
266 } // anonymous namespace
268 template<typename V> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::interleave(const typename V::AsArg v0,/*{{{*/
269 const typename V::AsArg v1)
271 InterleaveImpl<V, V::Size>::interleave(m_data, m_indexes, v0, v1);
273 template<typename V> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::interleave(const typename V::AsArg v0,/*{{{*/
274 const typename V::AsArg v1, const typename V::AsArg v2)
276 InterleaveImpl<V, V::Size>::interleave(m_data, m_indexes, v0, v1, v2);
278 template<typename V> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::interleave(const typename V::AsArg v0,/*{{{*/
279 const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3)
281 InterleaveImpl<V, V::Size>::interleave(m_data, m_indexes, v0, v1, v2, v3);
283 template<typename V> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::interleave(const typename V::AsArg v0,/*{{{*/
284 const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4)
286 InterleaveImpl<V, V::Size>::interleave(m_data, m_indexes, v0, v1, v2, v3);
287 v4.scatter(m_data + 4, m_indexes);
289 template<typename V> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::interleave(const typename V::AsArg v0,/*{{{*/
290 const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4,
291 const typename V::AsArg v5)
293 InterleaveImpl<V, V::Size>::interleave(m_data , m_indexes, v0, v1, v2, v3);
294 InterleaveImpl<V, V::Size>::interleave(m_data + 4, m_indexes, v4, v5);
296 template<typename V> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::interleave(const typename V::AsArg v0,/*{{{*/
297 const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4,
298 const typename V::AsArg v5, const typename V::AsArg v6)
300 InterleaveImpl<V, V::Size>::interleave(m_data + 0, m_indexes, v0, v1, v2, v3);
301 InterleaveImpl<V, V::Size>::interleave(m_data + 4, m_indexes, v4, v5, v6);
303 template<typename V> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::interleave(const typename V::AsArg v0,/*{{{*/
304 const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4,
305 const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7)
307 InterleaveImpl<V, V::Size>::interleave(m_data + 0, m_indexes, v0, v1, v2, v3);
308 InterleaveImpl<V, V::Size>::interleave(m_data + 4, m_indexes, v4, v5, v6, v7);
311 template<> inline void InterleavedMemoryAccessBase<float_v>::deinterleave(float_v &v0, float_v &v1) const/*{{{*/
313 const __m128 a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[0]])));
314 const __m128 b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[1]])));
315 const __m128 c = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[2]])));
316 const __m128 d = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[3]])));
318 const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
319 const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]
321 v0.data() = _mm_movelh_ps(tmp0, tmp1);
322 v1.data() = _mm_movehl_ps(tmp1, tmp0);
325 template<> inline void InterleavedMemoryAccessBase<float_v>::deinterleave(float_v &v0, float_v &v1, float_v &v2) const/*{{{*/
327 const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]);
328 const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]);
329 const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]);
330 const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]);
332 const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
333 const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]
334 const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 XX XX]
335 const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 XX XX]
337 v0.data() = _mm_movelh_ps(tmp0, tmp1);
338 v1.data() = _mm_movehl_ps(tmp1, tmp0);
339 v2.data() = _mm_movelh_ps(tmp2, tmp3);
342 template<> inline void InterleavedMemoryAccessBase<float_v>::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3) const/*{{{*/
344 const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]);
345 const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]);
346 const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]);
347 const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]);
349 const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
350 const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]
351 const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1]
352 const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3]
354 v0.data() = _mm_movelh_ps(tmp0, tmp1);
355 v1.data() = _mm_movehl_ps(tmp1, tmp0);
356 v2.data() = _mm_movelh_ps(tmp2, tmp3);
357 v3.data() = _mm_movehl_ps(tmp3, tmp2);
360 template<> inline void InterleavedMemoryAccessBase<float_v>::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4) const/*{{{*/
362 v4.gather(m_data, m_indexes + I(4));
363 deinterleave(v0, v1, v2, v3);
366 template<> inline void InterleavedMemoryAccessBase<float_v>::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5) const/*{{{*/
368 const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]);
369 const __m128 e = _mm_loadu_ps(&m_data[4 + m_indexes[0]]);
370 const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]);
371 const __m128 f = _mm_loadu_ps(&m_data[4 + m_indexes[1]]);
373 const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
374 const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1]
375 const __m128 tmp4 = _mm_unpacklo_ps(e, f); // [a0 a1 b0 b1]
377 const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]);
378 const __m128 g = _mm_loadu_ps(&m_data[4 + m_indexes[2]]);
379 const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]);
380 const __m128 h = _mm_loadu_ps(&m_data[4 + m_indexes[3]]);
382 const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]
383 v0.data() = _mm_movelh_ps(tmp0, tmp1);
384 v1.data() = _mm_movehl_ps(tmp1, tmp0);
386 const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3]
387 v2.data() = _mm_movelh_ps(tmp2, tmp3);
388 v3.data() = _mm_movehl_ps(tmp3, tmp2);
390 const __m128 tmp5 = _mm_unpacklo_ps(g, h); // [a2 a3 b2 b3]
391 v4.data() = _mm_movelh_ps(tmp4, tmp5);
392 v5.data() = _mm_movehl_ps(tmp5, tmp4);
395 template<> inline void InterleavedMemoryAccessBase<float_v>::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5, float_v &v6) const/*{{{*/
397 const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]);
398 const __m128 e = _mm_loadu_ps(&m_data[4 + m_indexes[0]]);
399 const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]);
400 const __m128 f = _mm_loadu_ps(&m_data[4 + m_indexes[1]]);
402 const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
403 const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1]
404 const __m128 tmp4 = _mm_unpacklo_ps(e, f); // [a0 a1 b0 b1]
405 const __m128 tmp6 = _mm_unpackhi_ps(e, f); // [c0 c1 d0 d1]
407 const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]);
408 const __m128 g = _mm_loadu_ps(&m_data[4 + m_indexes[2]]);
409 const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]);
410 const __m128 h = _mm_loadu_ps(&m_data[4 + m_indexes[3]]);
412 const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]
413 v0.data() = _mm_movelh_ps(tmp0, tmp1);
414 v1.data() = _mm_movehl_ps(tmp1, tmp0);
416 const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3]
417 v2.data() = _mm_movelh_ps(tmp2, tmp3);
418 v3.data() = _mm_movehl_ps(tmp3, tmp2);
420 const __m128 tmp5 = _mm_unpacklo_ps(g, h); // [a2 a3 b2 b3]
421 v4.data() = _mm_movelh_ps(tmp4, tmp5);
422 v5.data() = _mm_movehl_ps(tmp5, tmp4);
424 const __m128 tmp7 = _mm_unpackhi_ps(g, h); // [c2 c3 d2 d3]
425 v6.data() = _mm_movelh_ps(tmp6, tmp7);
428 template<> inline void InterleavedMemoryAccessBase<float_v>::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5, float_v &v6, float_v &v7) const/*{{{*/
430 const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]);
431 const __m128 e = _mm_loadu_ps(&m_data[4 + m_indexes[0]]);
432 const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]);
433 const __m128 f = _mm_loadu_ps(&m_data[4 + m_indexes[1]]);
435 const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
436 const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1]
437 const __m128 tmp4 = _mm_unpacklo_ps(e, f); // [a0 a1 b0 b1]
438 const __m128 tmp6 = _mm_unpackhi_ps(e, f); // [c0 c1 d0 d1]
440 const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]);
441 const __m128 g = _mm_loadu_ps(&m_data[4 + m_indexes[2]]);
442 const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]);
443 const __m128 h = _mm_loadu_ps(&m_data[4 + m_indexes[3]]);
445 const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]
446 v0.data() = _mm_movelh_ps(tmp0, tmp1);
447 v1.data() = _mm_movehl_ps(tmp1, tmp0);
449 const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3]
450 v2.data() = _mm_movelh_ps(tmp2, tmp3);
451 v3.data() = _mm_movehl_ps(tmp3, tmp2);
453 const __m128 tmp5 = _mm_unpacklo_ps(g, h); // [a2 a3 b2 b3]
454 v4.data() = _mm_movelh_ps(tmp4, tmp5);
455 v5.data() = _mm_movehl_ps(tmp5, tmp4);
457 const __m128 tmp7 = _mm_unpackhi_ps(g, h); // [c2 c3 d2 d3]
458 v6.data() = _mm_movelh_ps(tmp6, tmp7);
459 v7.data() = _mm_movehl_ps(tmp7, tmp6);
462 static inline void _sse_deinterleave_double(const double *VC_RESTRICT data, const uint_v &indexes, double_v &v0, double_v &v1)/*{{{*/
464 const __m128d a = _mm_loadu_pd(&data[indexes[0]]);
465 const __m128d b = _mm_loadu_pd(&data[indexes[1]]);
467 v0.data() = _mm_unpacklo_pd(a, b);
468 v1.data() = _mm_unpackhi_pd(a, b);
470 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<double_v>::deinterleave(double_v &v0, double_v &v1) const {/*{{{*/
471 _sse_deinterleave_double(m_data, m_indexes, v0, v1);
474 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<double_v>::deinterleave(double_v &v0, double_v &v1,/*{{{*/
475 double_v &v2) const {
476 v2.gather(m_data + 2, m_indexes);
477 _sse_deinterleave_double(m_data, m_indexes, v0, v1);
480 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<double_v>::deinterleave(double_v &v0, double_v &v1,/*{{{*/
481 double_v &v2, double_v &v3) const {
482 _sse_deinterleave_double(m_data , m_indexes, v0, v1);
483 _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3);
486 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<double_v>::deinterleave(double_v &v0, double_v &v1,/*{{{*/
487 double_v &v2, double_v &v3, double_v &v4) const {
488 v4.gather(m_data + 4, m_indexes);
489 _sse_deinterleave_double(m_data , m_indexes, v0, v1);
490 _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3);
493 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<double_v>::deinterleave(double_v &v0, double_v &v1,/*{{{*/
494 double_v &v2, double_v &v3, double_v &v4, double_v &v5) const {
495 _sse_deinterleave_double(m_data , m_indexes, v0, v1);
496 _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3);
497 _sse_deinterleave_double(m_data + 4, m_indexes, v4, v5);
500 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<double_v>::deinterleave(double_v &v0, double_v &v1,/*{{{*/
501 double_v &v2, double_v &v3, double_v &v4, double_v &v5, double_v &v6) const {
502 v6.gather(m_data + 6, m_indexes);
503 _sse_deinterleave_double(m_data , m_indexes, v0, v1);
504 _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3);
505 _sse_deinterleave_double(m_data + 4, m_indexes, v4, v5);
508 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<double_v>::deinterleave(double_v &v0, double_v &v1,/*{{{*/
509 double_v &v2, double_v &v3, double_v &v4, double_v &v5, double_v &v6, double_v &v7) const {
510 _sse_deinterleave_double(m_data , m_indexes, v0, v1);
511 _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3);
512 _sse_deinterleave_double(m_data + 4, m_indexes, v4, v5);
513 _sse_deinterleave_double(m_data + 6, m_indexes, v6, v7);
516 template<> inline void InterleavedMemoryAccessBase<short_v>::deinterleave(short_v &v0, short_v &v1) const {/*{{{*/
517 const __m128i a = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(&m_data[m_indexes[0]]));
518 const __m128i b = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(&m_data[m_indexes[1]]));
519 const __m128i c = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(&m_data[m_indexes[2]]));
520 const __m128i d = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(&m_data[m_indexes[3]]));
521 const __m128i e = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(&m_data[m_indexes[4]]));
522 const __m128i f = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(&m_data[m_indexes[5]]));
523 const __m128i g = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(&m_data[m_indexes[6]]));
524 const __m128i h = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(&m_data[m_indexes[7]]));
526 const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
527 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
528 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
529 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
531 const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
532 const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
534 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
535 v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
538 template<> inline void InterleavedMemoryAccessBase<short_v>::deinterleave(short_v &v0, short_v &v1,/*{{{*/
540 const __m128i a = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[0]]));
541 const __m128i b = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[1]]));
542 const __m128i c = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[2]]));
543 const __m128i d = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[3]]));
544 const __m128i e = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[4]]));
545 const __m128i f = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[5]]));
546 const __m128i g = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[6]]));
547 const __m128i h = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[7]]));
549 const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
550 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
551 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
552 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
554 const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
555 const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
556 const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
557 const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
559 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
560 v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
561 v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
563 template<> inline void InterleavedMemoryAccessBase<short_v>::deinterleave(short_v &v0, short_v &v1,/*{{{*/
564 short_v &v2, short_v &v3) const {
565 const __m128i a = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[0]]));
566 const __m128i b = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[1]]));
567 const __m128i c = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[2]]));
568 const __m128i d = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[3]]));
569 const __m128i e = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[4]]));
570 const __m128i f = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[5]]));
571 const __m128i g = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[6]]));
572 const __m128i h = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[7]]));
574 const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
575 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
576 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
577 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
579 const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
580 const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
581 const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
582 const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
584 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
585 v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
586 v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
587 v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
589 template<> inline void InterleavedMemoryAccessBase<short_v>::deinterleave(short_v &v0, short_v &v1,/*{{{*/
590 short_v &v2, short_v &v3, short_v &v4) const {
591 const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[0]]));
592 const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[1]]));
593 const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[2]]));
594 const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[3]]));
595 const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[4]]));
596 const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[5]]));
597 const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[6]]));
598 const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[7]]));
600 const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
601 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
602 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
603 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
604 const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
605 const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5
606 const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6
607 const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7
609 const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
610 const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
611 const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
612 const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
613 const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
614 const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
616 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
617 v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
618 v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
619 v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
620 v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
622 template<> inline void InterleavedMemoryAccessBase<short_v>::deinterleave(short_v &v0, short_v &v1,/*{{{*/
623 short_v &v2, short_v &v3, short_v &v4, short_v &v5) const {
624 const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[0]]));
625 const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[1]]));
626 const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[2]]));
627 const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[3]]));
628 const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[4]]));
629 const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[5]]));
630 const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[6]]));
631 const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[7]]));
633 const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
634 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
635 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
636 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
637 const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
638 const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5
639 const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6
640 const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7
642 const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
643 const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
644 const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
645 const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
646 const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
647 const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
649 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
650 v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
651 v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
652 v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
653 v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
654 v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
656 template<> inline void InterleavedMemoryAccessBase<short_v>::deinterleave(short_v &v0, short_v &v1,/*{{{*/
657 short_v &v2, short_v &v3, short_v &v4, short_v &v5, short_v &v6) const {
658 const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[0]]));
659 const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[1]]));
660 const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[2]]));
661 const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[3]]));
662 const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[4]]));
663 const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[5]]));
664 const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[6]]));
665 const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[7]]));
667 const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
668 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
669 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
670 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
671 const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
672 const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5
673 const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6
674 const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7
676 const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
677 const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
678 const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
679 const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
680 const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
681 const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
682 const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6
683 const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7
685 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
686 v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
687 v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
688 v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
689 v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
690 v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
691 v6.data() = _mm_unpacklo_epi16(tmp14, tmp15);
693 template<> inline void InterleavedMemoryAccessBase<short_v>::deinterleave(short_v &v0, short_v &v1,/*{{{*/
694 short_v &v2, short_v &v3, short_v &v4, short_v &v5, short_v &v6, short_v &v7) const {
695 const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[0]]));
696 const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[1]]));
697 const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[2]]));
698 const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[3]]));
699 const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[4]]));
700 const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[5]]));
701 const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[6]]));
702 const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[7]]));
704 const __m128i tmp2 = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
705 const __m128i tmp4 = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
706 const __m128i tmp3 = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
707 const __m128i tmp5 = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
708 const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
709 const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e1 e5 f1 f5 g1 g5 h1 h5
710 const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e2 e6 f2 f6 g2 g6 h2 h6
711 const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7
713 const __m128i tmp0 = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
714 const __m128i tmp1 = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
715 const __m128i tmp6 = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
716 const __m128i tmp7 = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
717 const __m128i tmp8 = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
718 const __m128i tmp9 = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
719 const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6
720 const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7
722 v0.data() = _mm_unpacklo_epi16(tmp0, tmp1);
723 v1.data() = _mm_unpackhi_epi16(tmp0, tmp1);
724 v2.data() = _mm_unpacklo_epi16(tmp6, tmp7);
725 v3.data() = _mm_unpackhi_epi16(tmp6, tmp7);
726 v4.data() = _mm_unpacklo_epi16(tmp8, tmp9);
727 v5.data() = _mm_unpackhi_epi16(tmp8, tmp9);
728 v6.data() = _mm_unpacklo_epi16(tmp14, tmp15);
729 v7.data() = _mm_unpackhi_epi16(tmp14, tmp15);
732 template<> inline void InterleavedMemoryAccessBase<sfloat_v>::deinterleave(sfloat_v &v0, sfloat_v &v1) const/*{{{*/
734 const __m128 i0a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[0]])));
735 const __m128 i1a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[1]])));
736 const __m128 i2a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[2]])));
737 const __m128 i3a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[3]])));
738 const __m128 i4a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[4]])));
739 const __m128 i5a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[5]])));
740 const __m128 i6a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[6]])));
741 const __m128 i7a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[7]])));
743 const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1]
744 const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3]
745 const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5]
746 const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7]
747 v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67));
748 v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45));
751 template<> inline void InterleavedMemoryAccessBase<sfloat_v>::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2) const/*{{{*/
753 const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]);
754 const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]);
755 const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]);
756 const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]);
757 const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]);
758 const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]);
759 const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]);
760 const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]);
762 const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1]
763 const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3]
764 const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5]
765 const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7]
766 v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67));
767 v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45));
769 const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1]
770 const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3]
771 const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5]
772 const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7]
773 v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67));
776 template<> inline void InterleavedMemoryAccessBase<sfloat_v>::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3) const/*{{{*/
778 const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]);
779 const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]);
780 const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]);
781 const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]);
782 const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]);
783 const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]);
784 const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]);
785 const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]);
787 const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1]
788 const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3]
789 const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5]
790 const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7]
791 v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67));
792 v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45));
794 const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1]
795 const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3]
796 const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5]
797 const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7]
798 v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67));
799 v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45));
802 template<> inline void InterleavedMemoryAccessBase<sfloat_v>::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4) const/*{{{*/
804 const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]);
805 const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]);
806 const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]);
807 const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]);
808 const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]);
809 const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]);
810 const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]);
811 const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]);
812 v4.gather(m_data + float_v::Size, m_indexes);
814 const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1]
815 const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3]
816 const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5]
817 const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7]
818 v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67));
819 v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45));
821 const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1]
822 const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3]
823 const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5]
824 const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7]
825 v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67));
826 v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45));
829 template<> inline void InterleavedMemoryAccessBase<sfloat_v>::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5) const/*{{{*/
831 const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]);
832 const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]);
833 const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]);
834 const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]);
835 const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]);
836 const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]);
837 const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]);
838 const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]);
839 const __m128 i0b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[0] + float_v::Size])));
840 const __m128 i1b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[1] + float_v::Size])));
841 const __m128 i2b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[2] + float_v::Size])));
842 const __m128 i3b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[3] + float_v::Size])));
843 const __m128 i4b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[4] + float_v::Size])));
844 const __m128 i5b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[5] + float_v::Size])));
845 const __m128 i6b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[6] + float_v::Size])));
846 const __m128 i7b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[7] + float_v::Size])));
848 const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1]
849 const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3]
850 const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5]
851 const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7]
852 v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67));
853 v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45));
855 const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1]
856 const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3]
857 const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5]
858 const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7]
859 v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67));
860 v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45));
862 const __m128 ef01 = _mm_unpacklo_ps(i0b, i1b); // [e0 e1 f0 f1]
863 const __m128 ef23 = _mm_unpacklo_ps(i2b, i3b); // [e2 e3 f2 f3]
864 const __m128 ef45 = _mm_unpacklo_ps(i4b, i5b); // [e4 e5 f4 f5]
865 const __m128 ef67 = _mm_unpacklo_ps(i6b, i7b); // [e6 e7 f6 f7]
866 v4.data() = Vc::SSE::M256::create(_mm_movelh_ps(ef01, ef23), _mm_movelh_ps(ef45, ef67));
867 v5.data() = Vc::SSE::M256::create(_mm_movehl_ps(ef23, ef01), _mm_movehl_ps(ef67, ef45));
870 template<> inline void InterleavedMemoryAccessBase<sfloat_v>::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5, sfloat_v &v6) const/*{{{*/
872 const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]);
873 const __m128 i0b = _mm_loadu_ps(&m_data[m_indexes[0] + float_v::Size]);
874 const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]);
875 const __m128 i1b = _mm_loadu_ps(&m_data[m_indexes[1] + float_v::Size]);
876 const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]);
877 const __m128 i2b = _mm_loadu_ps(&m_data[m_indexes[2] + float_v::Size]);
878 const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]);
879 const __m128 i3b = _mm_loadu_ps(&m_data[m_indexes[3] + float_v::Size]);
880 const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]);
881 const __m128 i4b = _mm_loadu_ps(&m_data[m_indexes[4] + float_v::Size]);
882 const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]);
883 const __m128 i5b = _mm_loadu_ps(&m_data[m_indexes[5] + float_v::Size]);
884 const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]);
885 const __m128 i6b = _mm_loadu_ps(&m_data[m_indexes[6] + float_v::Size]);
886 const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]);
887 const __m128 i7b = _mm_loadu_ps(&m_data[m_indexes[7] + float_v::Size]);
889 const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1]
890 const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3]
891 const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5]
892 const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7]
893 v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67));
894 v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45));
896 const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1]
897 const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3]
898 const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5]
899 const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7]
900 v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67));
901 v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45));
903 const __m128 ef01 = _mm_unpacklo_ps(i0b, i1b); // [e0 e1 f0 f1]
904 const __m128 ef23 = _mm_unpacklo_ps(i2b, i3b); // [e2 e3 f2 f3]
905 const __m128 ef45 = _mm_unpacklo_ps(i4b, i5b); // [e4 e5 f4 f5]
906 const __m128 ef67 = _mm_unpacklo_ps(i6b, i7b); // [e6 e7 f6 f7]
907 v4.data() = Vc::SSE::M256::create(_mm_movelh_ps(ef01, ef23), _mm_movelh_ps(ef45, ef67));
908 v5.data() = Vc::SSE::M256::create(_mm_movehl_ps(ef23, ef01), _mm_movehl_ps(ef67, ef45));
910 const __m128 gh01 = _mm_unpackhi_ps(i0b, i1b); // [g0 g1 h0 h1]
911 const __m128 gh23 = _mm_unpackhi_ps(i2b, i3b); // [g2 g3 h2 h3]
912 const __m128 gh45 = _mm_unpackhi_ps(i4b, i5b); // [g4 g5 h4 h5]
913 const __m128 gh67 = _mm_unpackhi_ps(i6b, i7b); // [g6 g7 h6 h7]
914 v6.data() = Vc::SSE::M256::create(_mm_movelh_ps(gh01, gh23), _mm_movelh_ps(gh45, gh67));
917 template<> inline void InterleavedMemoryAccessBase<sfloat_v>::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5, sfloat_v &v6, sfloat_v &v7) const/*{{{*/
919 const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]);
920 const __m128 i0b = _mm_loadu_ps(&m_data[m_indexes[0] + float_v::Size]);
921 const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]);
922 const __m128 i1b = _mm_loadu_ps(&m_data[m_indexes[1] + float_v::Size]);
923 const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]);
924 const __m128 i2b = _mm_loadu_ps(&m_data[m_indexes[2] + float_v::Size]);
925 const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]);
926 const __m128 i3b = _mm_loadu_ps(&m_data[m_indexes[3] + float_v::Size]);
927 const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]);
928 const __m128 i4b = _mm_loadu_ps(&m_data[m_indexes[4] + float_v::Size]);
929 const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]);
930 const __m128 i5b = _mm_loadu_ps(&m_data[m_indexes[5] + float_v::Size]);
931 const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]);
932 const __m128 i6b = _mm_loadu_ps(&m_data[m_indexes[6] + float_v::Size]);
933 const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]);
934 const __m128 i7b = _mm_loadu_ps(&m_data[m_indexes[7] + float_v::Size]);
936 const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1]
937 const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3]
938 const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5]
939 const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7]
940 v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67));
941 v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45));
943 const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1]
944 const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3]
945 const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5]
946 const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7]
947 v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67));
948 v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45));
950 const __m128 ef01 = _mm_unpacklo_ps(i0b, i1b); // [e0 e1 f0 f1]
951 const __m128 ef23 = _mm_unpacklo_ps(i2b, i3b); // [e2 e3 f2 f3]
952 const __m128 ef45 = _mm_unpacklo_ps(i4b, i5b); // [e4 e5 f4 f5]
953 const __m128 ef67 = _mm_unpacklo_ps(i6b, i7b); // [e6 e7 f6 f7]
954 v4.data() = Vc::SSE::M256::create(_mm_movelh_ps(ef01, ef23), _mm_movelh_ps(ef45, ef67));
955 v5.data() = Vc::SSE::M256::create(_mm_movehl_ps(ef23, ef01), _mm_movehl_ps(ef67, ef45));
957 const __m128 gh01 = _mm_unpackhi_ps(i0b, i1b); // [g0 g1 h0 h1]
958 const __m128 gh23 = _mm_unpackhi_ps(i2b, i3b); // [g2 g3 h2 h3]
959 const __m128 gh45 = _mm_unpackhi_ps(i4b, i5b); // [g4 g5 h4 h5]
960 const __m128 gh67 = _mm_unpackhi_ps(i6b, i7b); // [g6 g7 h6 h7]
961 v6.data() = Vc::SSE::M256::create(_mm_movelh_ps(gh01, gh23), _mm_movelh_ps(gh45, gh67));
962 v7.data() = Vc::SSE::M256::create(_mm_movehl_ps(gh23, gh01), _mm_movehl_ps(gh67, gh45));
// forward types of equal size - ugly, but it works/*{{{*/
// _forward(V, V2) defines all deinterleave overloads (2 to 8 outputs) for
// vector type V by reinterpret_cast-ing *this and the output references to
// the bit-compatible vector type V2 and delegating to V2's specializations
// above.  This relies on V and V2 having identical size and register
// layout (int_v/uint_v <-> float_v, ushort_v <-> short_v).
// NOTE(review): no comments inside the macro body — a '\' at the end of a
// '//' comment line would splice the next line into the comment.
#define _forward(V, V2) \
template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::deinterleave(V &v0, V &v1) const { \
    reinterpret_cast<const InterleavedMemoryAccessBase<V2> *>(this)->deinterleave(reinterpret_cast<V2 &>(v0), reinterpret_cast<V2 &>(v1)); \
} \
template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::deinterleave(V &v0, V &v1, V &v2) const { \
    reinterpret_cast<const InterleavedMemoryAccessBase<V2> *>(this)->deinterleave(reinterpret_cast<V2 &>(v0), reinterpret_cast<V2 &>(v1), \
            reinterpret_cast<V2 &>(v2)); \
} \
template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::deinterleave(V &v0, V &v1, V &v2, V &v3) const { \
    reinterpret_cast<const InterleavedMemoryAccessBase<V2> *>(this)->deinterleave(reinterpret_cast<V2 &>(v0), reinterpret_cast<V2 &>(v1), \
            reinterpret_cast<V2 &>(v2), reinterpret_cast<V2 &>(v3)); \
} \
template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::deinterleave(V &v0, V &v1, V &v2, V &v3, \
        V &v4) const { \
    reinterpret_cast<const InterleavedMemoryAccessBase<V2> *>(this)->deinterleave(reinterpret_cast<V2 &>(v0), reinterpret_cast<V2 &>(v1), \
            reinterpret_cast<V2 &>(v2), reinterpret_cast<V2 &>(v3), reinterpret_cast<V2 &>(v4)); \
} \
template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::deinterleave(V &v0, V &v1, V &v2, V &v3, \
        V &v4, V &v5) const { \
    reinterpret_cast<const InterleavedMemoryAccessBase<V2> *>(this)->deinterleave(reinterpret_cast<V2 &>(v0), reinterpret_cast<V2 &>(v1), \
            reinterpret_cast<V2 &>(v2), reinterpret_cast<V2 &>(v3), reinterpret_cast<V2 &>(v4), \
            reinterpret_cast<V2 &>(v5)); \
} \
template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::deinterleave(V &v0, V &v1, V &v2, V &v3, \
        V &v4, V &v5, V &v6) const { \
    reinterpret_cast<const InterleavedMemoryAccessBase<V2> *>(this)->deinterleave(reinterpret_cast<V2 &>(v0), reinterpret_cast<V2 &>(v1), \
            reinterpret_cast<V2 &>(v2), reinterpret_cast<V2 &>(v3), reinterpret_cast<V2 &>(v4), \
            reinterpret_cast<V2 &>(v5), reinterpret_cast<V2 &>(v6)); \
} \
template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::deinterleave(V &v0, V &v1, V &v2, V &v3, \
        V &v4, V &v5, V &v6, V &v7) const { \
    reinterpret_cast<const InterleavedMemoryAccessBase<V2> *>(this)->deinterleave(reinterpret_cast<V2 &>(v0), reinterpret_cast<V2 &>(v1), \
            reinterpret_cast<V2 &>(v2), reinterpret_cast<V2 &>(v3), reinterpret_cast<V2 &>(v4), \
            reinterpret_cast<V2 &>(v5), reinterpret_cast<V2 &>(v6), reinterpret_cast<V2 &>(v7)); \
}
// Instantiate the forwarding specializations for the equal-sized pairs.
_forward( int_v, float_v)
_forward(uint_v, float_v)
_forward(ushort_v, short_v)
#undef _forward/*}}}*/
1006 } // namespace Common
1008 } // namespace AliRoot
1010 #include "undomacros.h"
1012 #endif // VC_SSE_INTERLEAVEDMEMORY_TCC
1014 // vim: foldmethod=marker