Vc/include/Vc/sse/interleavedmemory.tcc

   1 /*  This file is part of the Vc library. {{{
   2
   3     Copyright (C) 2012 Matthias Kretz <kretz@kde.org>
   4
   5     Vc is free software: you can redistribute it and/or modify
   6     it under the terms of the GNU Lesser General Public License as
   7     published by the Free Software Foundation, either version 3 of
   8     the License, or (at your option) any later version.
   9
  10     Vc is distributed in the hope that it will be useful, but
  11     WITHOUT ANY WARRANTY; without even the implied warranty of
  12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13     GNU Lesser General Public License for more details.
  14
  15     You should have received a copy of the GNU Lesser General Public
  16     License along with Vc.  If not, see <http://www.gnu.org/licenses/>.
  17
  18 }}}*/
  19
  20 #ifndef VC_SSE_INTERLEAVEDMEMORY_TCC
  21 #define VC_SSE_INTERLEAVEDMEMORY_TCC
  22
#include <cstring>

#include "macros.h"
  24
  25 namespace AliRoot {
  26 namespace Vc
  27 {
  28 namespace Common
  29 {
  30
  31 namespace
  32 {
  33 template<typename V, int Size> struct InterleaveImpl;
  34 template<> struct InterleaveImpl<SSE::sfloat_v, 8> {
  35     static inline void interleave(float *const data, const SSE::sfloat_v::IndexType &i,/*{{{*/
  36             const SSE::sfloat_v::AsArg v0, const SSE::sfloat_v::AsArg v1)
  37     {
  38         const __m128 tmp0 = _mm_unpacklo_ps(v0.data()[0], v1.data()[0]);
  39         const __m128 tmp1 = _mm_unpackhi_ps(v0.data()[0], v1.data()[0]);
  40         const __m128 tmp2 = _mm_unpacklo_ps(v0.data()[1], v1.data()[1]);
  41         const __m128 tmp3 = _mm_unpackhi_ps(v0.data()[1], v1.data()[1]);
  42
  43         _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0);
  44         _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0);
  45         _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1);
  46         _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1);
  47         _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[4]]), tmp2);
  48         _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), tmp2);
  49         _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[6]]), tmp3);
  50         _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), tmp3);
  51     }/*}}}*/
  52     static inline void interleave(float *const data, const SSE::sfloat_v::IndexType &i,/*{{{*/
  53             const SSE::sfloat_v::AsArg v0, const SSE::sfloat_v::AsArg v1, const SSE::sfloat_v::AsArg v2)
  54     {
  55 #ifdef VC_USE_MASKMOV_SCATTER
  56         const __m128i mask = _mm_set_epi32(0, -1, -1, -1);
  57
  58         const __m128 tmp0 = _mm_unpacklo_ps(v0.data()[0], v1.data()[0]);
  59         const __m128 tmp1 = _mm_unpackhi_ps(v0.data()[0], v1.data()[0]);
  60         const __m128 tmp2 = _mm_unpacklo_ps(v2.data()[0], v2.data()[0]);
  61         const __m128 tmp3 = _mm_unpackhi_ps(v2.data()[0], v2.data()[0]);
  62         _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp0, tmp2)), mask, reinterpret_cast<char *>(&data[i[0]]));
  63         _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp2, tmp0)), mask, reinterpret_cast<char *>(&data[i[1]]));
  64         _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp1, tmp3)), mask, reinterpret_cast<char *>(&data[i[2]]));
  65         _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp3, tmp1)), mask, reinterpret_cast<char *>(&data[i[3]]));
  66
  67         const __m128 tmp8 = _mm_unpacklo_ps(v0.data()[1], v1.data()[1]);
  68         const __m128 tmp9 = _mm_unpackhi_ps(v0.data()[1], v1.data()[1]);
  69         const __m128 tmp10 = _mm_unpacklo_ps(v2.data()[1], v2.data()[1]);
  70         const __m128 tmp11 = _mm_unpackhi_ps(v2.data()[1], v2.data()[1]);
  71         _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp8, tmp10)), mask, reinterpret_cast<char *>(&data[i[4]]));
  72         _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp10, tmp8)), mask, reinterpret_cast<char *>(&data[i[5]]));
  73         _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp9, tmp11)), mask, reinterpret_cast<char *>(&data[i[6]]));
  74         _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp11, tmp9)), mask, reinterpret_cast<char *>(&data[i[7]]));
  75 #else
  76         interleave(data, i, v0, v1);
  77         v2.scatter(data + 2, i);
  78 #endif
  79     }/*}}}*/
  80     static inline void interleave(float *const data, const SSE::sfloat_v::IndexType &i,/*{{{*/
  81             const SSE::sfloat_v::AsArg v0, const SSE::sfloat_v::AsArg v1,
  82             const SSE::sfloat_v::AsArg v2, const SSE::sfloat_v::AsArg v3)
  83     {
  84         const __m128 tmp0 = _mm_unpacklo_ps(v0.data()[0], v1.data()[0]);
  85         const __m128 tmp1 = _mm_unpackhi_ps(v0.data()[0], v1.data()[0]);
  86         const __m128 tmp2 = _mm_unpacklo_ps(v2.data()[0], v3.data()[0]);
  87         const __m128 tmp3 = _mm_unpackhi_ps(v2.data()[0], v3.data()[0]);
  88         _mm_storeu_ps(&data[i[0]], _mm_movelh_ps(tmp0, tmp2));
  89         _mm_storeu_ps(&data[i[1]], _mm_movehl_ps(tmp2, tmp0));
  90         _mm_storeu_ps(&data[i[2]], _mm_movelh_ps(tmp1, tmp3));
  91         _mm_storeu_ps(&data[i[3]], _mm_movehl_ps(tmp3, tmp1));
  92
  93         const __m128 tmp8 = _mm_unpacklo_ps(v0.data()[1], v1.data()[1]);
  94         const __m128 tmp9 = _mm_unpackhi_ps(v0.data()[1], v1.data()[1]);
  95         const __m128 tmp10 = _mm_unpacklo_ps(v2.data()[1], v3.data()[1]);
  96         const __m128 tmp11 = _mm_unpackhi_ps(v2.data()[1], v3.data()[1]);
  97         _mm_storeu_ps(&data[i[4]], _mm_movelh_ps(tmp8, tmp10));
  98         _mm_storeu_ps(&data[i[5]], _mm_movehl_ps(tmp10, tmp8));
  99         _mm_storeu_ps(&data[i[6]], _mm_movelh_ps(tmp9, tmp11));
 100         _mm_storeu_ps(&data[i[7]], _mm_movehl_ps(tmp11, tmp9));
 101     }/*}}}*/
 102 };
 103 template<typename V> struct InterleaveImpl<V, 8> {
 104     static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/
 105             const typename V::AsArg v0, const typename V::AsArg v1)
 106     {
 107         const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v1.data());
 108         const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v1.data());
 109 #ifdef __x86_64__
 110         const long long tmp00 = _mm_cvtsi128_si64(tmp0);
 111         const long long tmp01 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp0, tmp0));
 112         const long long tmp10 = _mm_cvtsi128_si64(tmp1);
 113         const long long tmp11 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(tmp1, tmp1));
 114         *reinterpret_cast<int *>(&data[i[0]]) = tmp00;
 115         *reinterpret_cast<int *>(&data[i[1]]) = tmp00 >> 32;
 116         *reinterpret_cast<int *>(&data[i[2]]) = tmp01;
 117         *reinterpret_cast<int *>(&data[i[3]]) = tmp01 >> 32;
 118         *reinterpret_cast<int *>(&data[i[4]]) = tmp10;
 119         *reinterpret_cast<int *>(&data[i[5]]) = tmp10 >> 32;
 120         *reinterpret_cast<int *>(&data[i[6]]) = tmp11;
 121         *reinterpret_cast<int *>(&data[i[7]]) = tmp11 >> 32;
 122 #elif defined(VC_IMPL_SSE4_1)
 123         *reinterpret_cast<int *>(&data[i[0]]) = _mm_cvtsi128_si32(tmp0);
 124         *reinterpret_cast<int *>(&data[i[1]]) = _mm_extract_epi32(tmp0, 1);
 125         *reinterpret_cast<int *>(&data[i[2]]) = _mm_extract_epi32(tmp0, 2);
 126         *reinterpret_cast<int *>(&data[i[3]]) = _mm_extract_epi32(tmp0, 3);
 127         *reinterpret_cast<int *>(&data[i[4]]) = _mm_cvtsi128_si32(tmp1);
 128         *reinterpret_cast<int *>(&data[i[5]]) = _mm_extract_epi32(tmp1, 1);
 129         *reinterpret_cast<int *>(&data[i[6]]) = _mm_extract_epi32(tmp1, 2);
 130         *reinterpret_cast<int *>(&data[i[7]]) = _mm_extract_epi32(tmp1, 3);
 131 #else
 132         *reinterpret_cast<int *>(&data[i[0]]) = _mm_cvtsi128_si32(tmp0);
 133         *reinterpret_cast<int *>(&data[i[1]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 4));
 134         *reinterpret_cast<int *>(&data[i[2]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 8));
 135         *reinterpret_cast<int *>(&data[i[3]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp0, 12));
 136         *reinterpret_cast<int *>(&data[i[4]]) = _mm_cvtsi128_si32(tmp1);
 137         *reinterpret_cast<int *>(&data[i[5]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 4));
 138         *reinterpret_cast<int *>(&data[i[6]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 8));
 139         *reinterpret_cast<int *>(&data[i[7]]) = _mm_cvtsi128_si32(_mm_srli_si128(tmp1, 12));
 140 #endif
 141     }/*}}}*/
 142     static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/
 143             const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
 144     {
 145 #ifdef VC_USE_MASKMOV_SCATTER
 146         const __m128i maskLo = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
 147         const __m128i maskHi = _mm_set_epi16(0, -1, -1, -1, 0, 0, 0, 0);
 148         typename V::EntryType *const dataHi = data - 4;
 149         const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
 150         const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
 151         const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v1.data());
 152         const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v1.data());
 153
 154         const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
 155         const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
 156         const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
 157         const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);
 158         _mm_maskmoveu_si128(tmp4, maskLo, reinterpret_cast<char *>(&data[i[0]]));
 159         _mm_maskmoveu_si128(tmp4, maskHi, reinterpret_cast<char *>(&dataHi[i[1]]));
 160         _mm_maskmoveu_si128(tmp5, maskLo, reinterpret_cast<char *>(&data[i[2]]));
 161         _mm_maskmoveu_si128(tmp5, maskHi, reinterpret_cast<char *>(&dataHi[i[3]]));
 162         _mm_maskmoveu_si128(tmp6, maskLo, reinterpret_cast<char *>(&data[i[4]]));
 163         _mm_maskmoveu_si128(tmp6, maskHi, reinterpret_cast<char *>(&dataHi[i[5]]));
 164         _mm_maskmoveu_si128(tmp7, maskLo, reinterpret_cast<char *>(&data[i[6]]));
 165         _mm_maskmoveu_si128(tmp7, maskHi, reinterpret_cast<char *>(&dataHi[i[7]]));
 166 #else
 167         interleave(data, i, v0, v1);
 168         v2.scatter(data + 2, i);
 169 #endif
 170     }/*}}}*/
 171     static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/
 172             const typename V::AsArg v0, const typename V::AsArg v1,
 173             const typename V::AsArg v2, const typename V::AsArg v3)
 174     {
 175         const __m128i tmp0 = _mm_unpacklo_epi16(v0.data(), v2.data());
 176         const __m128i tmp1 = _mm_unpackhi_epi16(v0.data(), v2.data());
 177         const __m128i tmp2 = _mm_unpacklo_epi16(v1.data(), v3.data());
 178         const __m128i tmp3 = _mm_unpackhi_epi16(v1.data(), v3.data());
 179
 180         const __m128i tmp4 = _mm_unpacklo_epi16(tmp0, tmp2);
 181         const __m128i tmp5 = _mm_unpackhi_epi16(tmp0, tmp2);
 182         const __m128i tmp6 = _mm_unpacklo_epi16(tmp1, tmp3);
 183         const __m128i tmp7 = _mm_unpackhi_epi16(tmp1, tmp3);
 184
 185         _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[0]]), tmp4);
 186         _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[2]]), tmp5);
 187         _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[4]]), tmp6);
 188         _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[6]]), tmp7);
 189         _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), _mm_castsi128_ps(tmp4));
 190         _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), _mm_castsi128_ps(tmp5));
 191         _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), _mm_castsi128_ps(tmp6));
 192         _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), _mm_castsi128_ps(tmp7));
 193     }/*}}}*/
 194 };
 195 template<typename V> struct InterleaveImpl<V, 4> {
 196     static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/
 197             const typename V::AsArg v0, const typename V::AsArg v1)
 198     {
 199         const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
 200         const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
 201         _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0);
 202         _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0);
 203         _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1);
 204         _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1);
 205     }/*}}}*/
 206     static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/
 207             const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
 208     {
 209 #ifdef VC_USE_MASKMOV_SCATTER
 210         const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
 211         const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()), SSE::sse_cast<__m128>(v1.data()));
 212         const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data()));
 213         const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()), SSE::sse_cast<__m128>(v2.data()));
 214         const __m128i mask = _mm_set_epi32(0, -1, -1, -1);
 215         _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp0, tmp2)), mask, reinterpret_cast<char *>(&data[i[0]]));
 216         _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp2, tmp0)), mask, reinterpret_cast<char *>(&data[i[1]]));
 217         _mm_maskmoveu_si128(_mm_castps_si128(_mm_movelh_ps(tmp1, tmp3)), mask, reinterpret_cast<char *>(&data[i[2]]));
 218         _mm_maskmoveu_si128(_mm_castps_si128(_mm_movehl_ps(tmp3, tmp1)), mask, reinterpret_cast<char *>(&data[i[3]]));
 219 #else
 220         const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
 221         const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
 222         _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), tmp0);
 223         _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), tmp0);
 224         _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), tmp1);
 225         _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), tmp1);
 226         v2.scatter(data + 2, i);
 227 #endif
 228     }/*}}}*/
 229     static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/
 230             const typename V::AsArg v0, const typename V::AsArg v1,
 231             const typename V::AsArg v2, const typename V::AsArg v3)
 232     {
 233         const __m128 tmp0 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
 234         const __m128 tmp1 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v0.data()),SSE::sse_cast<__m128>(v1.data()));
 235         const __m128 tmp2 = _mm_unpacklo_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data()));
 236         const __m128 tmp3 = _mm_unpackhi_ps(SSE::sse_cast<__m128>(v2.data()),SSE::sse_cast<__m128>(v3.data()));
 237         _mm_storeu_ps(reinterpret_cast<float *>(&data[i[0]]), _mm_movelh_ps(tmp0, tmp2));
 238         _mm_storeu_ps(reinterpret_cast<float *>(&data[i[1]]), _mm_movehl_ps(tmp2, tmp0));
 239         _mm_storeu_ps(reinterpret_cast<float *>(&data[i[2]]), _mm_movelh_ps(tmp1, tmp3));
 240         _mm_storeu_ps(reinterpret_cast<float *>(&data[i[3]]), _mm_movehl_ps(tmp3, tmp1));
 241     }/*}}}*/
 242 };
 243 template<typename V> struct InterleaveImpl<V, 2> {
 244     static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/
 245             const typename V::AsArg v0, const typename V::AsArg v1)
 246     {
 247         const __m128d tmp0 = _mm_unpacklo_pd(v0.data(), v1.data());
 248         const __m128d tmp1 = _mm_unpackhi_pd(v0.data(), v1.data());
 249         _mm_storeu_pd(&data[i[0]], tmp0);
 250         _mm_storeu_pd(&data[i[1]], tmp1);
 251     }/*}}}*/
 252     static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/
 253             const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
 254     {
 255         interleave(data, i, v0, v1);
 256         v2.scatter(data + 2, i);
 257     }/*}}}*/
 258     static inline void interleave(typename V::EntryType *const data, const typename V::IndexType &i,/*{{{*/
 259             const typename V::AsArg v0, const typename V::AsArg v1,
 260             const typename V::AsArg v2, const typename V::AsArg v3)
 261     {
 262         interleave(data, i, v0, v1);
 263         interleave(data + 2, i, v2, v3);
 264     }/*}}}*/
 265 };
 266 } // anonymous namespace
 267
 268 template<typename V> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::interleave(const typename V::AsArg v0,/*{{{*/
 269         const typename V::AsArg v1)
 270 {
 271     InterleaveImpl<V, V::Size>::interleave(m_data, m_indexes, v0, v1);
 272 }/*}}}*/
 273 template<typename V> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::interleave(const typename V::AsArg v0,/*{{{*/
 274         const typename V::AsArg v1, const typename V::AsArg v2)
 275 {
 276     InterleaveImpl<V, V::Size>::interleave(m_data, m_indexes, v0, v1, v2);
 277 }/*}}}*/
 278 template<typename V> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::interleave(const typename V::AsArg v0,/*{{{*/
 279         const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3)
 280 {
 281     InterleaveImpl<V, V::Size>::interleave(m_data, m_indexes, v0, v1, v2, v3);
 282 }/*}}}*/
 283 template<typename V> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::interleave(const typename V::AsArg v0,/*{{{*/
 284         const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4)
 285 {
 286     InterleaveImpl<V, V::Size>::interleave(m_data, m_indexes, v0, v1, v2, v3);
 287     v4.scatter(m_data + 4, m_indexes);
 288 }/*}}}*/
 289 template<typename V> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::interleave(const typename V::AsArg v0,/*{{{*/
 290         const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4,
 291         const typename V::AsArg v5)
 292 {
 293     InterleaveImpl<V, V::Size>::interleave(m_data    , m_indexes, v0, v1, v2, v3);
 294     InterleaveImpl<V, V::Size>::interleave(m_data + 4, m_indexes, v4, v5);
 295 }/*}}}*/
 296 template<typename V> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::interleave(const typename V::AsArg v0,/*{{{*/
 297         const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4,
 298         const typename V::AsArg v5, const typename V::AsArg v6)
 299 {
 300     InterleaveImpl<V, V::Size>::interleave(m_data + 0, m_indexes, v0, v1, v2, v3);
 301     InterleaveImpl<V, V::Size>::interleave(m_data + 4, m_indexes, v4, v5, v6);
 302 }/*}}}*/
 303 template<typename V> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::interleave(const typename V::AsArg v0,/*{{{*/
 304         const typename V::AsArg v1, const typename V::AsArg v2, const typename V::AsArg v3, const typename V::AsArg v4,
 305         const typename V::AsArg v5, const typename V::AsArg v6, const typename V::AsArg v7)
 306 {
 307     InterleaveImpl<V, V::Size>::interleave(m_data + 0, m_indexes, v0, v1, v2, v3);
 308     InterleaveImpl<V, V::Size>::interleave(m_data + 4, m_indexes, v4, v5, v6, v7);
 309 }/*}}}*/
 310
 311 template<> inline void InterleavedMemoryAccessBase<float_v>::deinterleave(float_v &v0, float_v &v1) const/*{{{*/
 312 {
 313     const __m128 a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[0]])));
 314     const __m128 b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[1]])));
 315     const __m128 c = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[2]])));
 316     const __m128 d = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[3]])));
 317
 318     const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
 319     const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]
 320
 321     v0.data() = _mm_movelh_ps(tmp0, tmp1);
 322     v1.data() = _mm_movehl_ps(tmp1, tmp0);
 323 }
 324 /*}}}*/
 325 template<> inline void InterleavedMemoryAccessBase<float_v>::deinterleave(float_v &v0, float_v &v1, float_v &v2) const/*{{{*/
 326 {
 327     const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]);
 328     const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]);
 329     const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]);
 330     const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]);
 331
 332     const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
 333     const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]
 334     const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 XX XX]
 335     const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 XX XX]
 336
 337     v0.data() = _mm_movelh_ps(tmp0, tmp1);
 338     v1.data() = _mm_movehl_ps(tmp1, tmp0);
 339     v2.data() = _mm_movelh_ps(tmp2, tmp3);
 340 }
 341 /*}}}*/
 342 template<> inline void InterleavedMemoryAccessBase<float_v>::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3) const/*{{{*/
 343 {
 344     const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]);
 345     const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]);
 346     const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]);
 347     const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]);
 348
 349     const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
 350     const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]
 351     const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1]
 352     const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3]
 353
 354     v0.data() = _mm_movelh_ps(tmp0, tmp1);
 355     v1.data() = _mm_movehl_ps(tmp1, tmp0);
 356     v2.data() = _mm_movelh_ps(tmp2, tmp3);
 357     v3.data() = _mm_movehl_ps(tmp3, tmp2);
 358 }
 359 /*}}}*/
 360 template<> inline void InterleavedMemoryAccessBase<float_v>::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4) const/*{{{*/
 361 {
 362     v4.gather(m_data, m_indexes + I(4));
 363     deinterleave(v0, v1, v2, v3);
 364 }
 365 /*}}}*/
 366 template<> inline void InterleavedMemoryAccessBase<float_v>::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5) const/*{{{*/
 367 {
 368     const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]);
 369     const __m128 e = _mm_loadu_ps(&m_data[4 + m_indexes[0]]);
 370     const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]);
 371     const __m128 f = _mm_loadu_ps(&m_data[4 + m_indexes[1]]);
 372
 373     const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
 374     const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1]
 375     const __m128 tmp4 = _mm_unpacklo_ps(e, f); // [a0 a1 b0 b1]
 376
 377     const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]);
 378     const __m128 g = _mm_loadu_ps(&m_data[4 + m_indexes[2]]);
 379     const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]);
 380     const __m128 h = _mm_loadu_ps(&m_data[4 + m_indexes[3]]);
 381
 382     const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]
 383     v0.data() = _mm_movelh_ps(tmp0, tmp1);
 384     v1.data() = _mm_movehl_ps(tmp1, tmp0);
 385
 386     const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3]
 387     v2.data() = _mm_movelh_ps(tmp2, tmp3);
 388     v3.data() = _mm_movehl_ps(tmp3, tmp2);
 389
 390     const __m128 tmp5 = _mm_unpacklo_ps(g, h); // [a2 a3 b2 b3]
 391     v4.data() = _mm_movelh_ps(tmp4, tmp5);
 392     v5.data() = _mm_movehl_ps(tmp5, tmp4);
 393 }
 394 /*}}}*/
 395 template<> inline void InterleavedMemoryAccessBase<float_v>::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5, float_v &v6) const/*{{{*/
 396 {
 397     const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]);
 398     const __m128 e = _mm_loadu_ps(&m_data[4 + m_indexes[0]]);
 399     const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]);
 400     const __m128 f = _mm_loadu_ps(&m_data[4 + m_indexes[1]]);
 401
 402     const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
 403     const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1]
 404     const __m128 tmp4 = _mm_unpacklo_ps(e, f); // [a0 a1 b0 b1]
 405     const __m128 tmp6 = _mm_unpackhi_ps(e, f); // [c0 c1 d0 d1]
 406
 407     const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]);
 408     const __m128 g = _mm_loadu_ps(&m_data[4 + m_indexes[2]]);
 409     const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]);
 410     const __m128 h = _mm_loadu_ps(&m_data[4 + m_indexes[3]]);
 411
 412     const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]
 413     v0.data() = _mm_movelh_ps(tmp0, tmp1);
 414     v1.data() = _mm_movehl_ps(tmp1, tmp0);
 415
 416     const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3]
 417     v2.data() = _mm_movelh_ps(tmp2, tmp3);
 418     v3.data() = _mm_movehl_ps(tmp3, tmp2);
 419
 420     const __m128 tmp5 = _mm_unpacklo_ps(g, h); // [a2 a3 b2 b3]
 421     v4.data() = _mm_movelh_ps(tmp4, tmp5);
 422     v5.data() = _mm_movehl_ps(tmp5, tmp4);
 423
 424     const __m128 tmp7 = _mm_unpackhi_ps(g, h); // [c2 c3 d2 d3]
 425     v6.data() = _mm_movelh_ps(tmp6, tmp7);
 426 }
 427 /*}}}*/
 428 template<> inline void InterleavedMemoryAccessBase<float_v>::deinterleave(float_v &v0, float_v &v1, float_v &v2, float_v &v3, float_v &v4, float_v &v5, float_v &v6, float_v &v7) const/*{{{*/
 429 {
 430     const __m128 a = _mm_loadu_ps(&m_data[m_indexes[0]]);
 431     const __m128 e = _mm_loadu_ps(&m_data[4 + m_indexes[0]]);
 432     const __m128 b = _mm_loadu_ps(&m_data[m_indexes[1]]);
 433     const __m128 f = _mm_loadu_ps(&m_data[4 + m_indexes[1]]);
 434
 435     const __m128 tmp0 = _mm_unpacklo_ps(a, b); // [a0 a1 b0 b1]
 436     const __m128 tmp2 = _mm_unpackhi_ps(a, b); // [c0 c1 d0 d1]
 437     const __m128 tmp4 = _mm_unpacklo_ps(e, f); // [a0 a1 b0 b1]
 438     const __m128 tmp6 = _mm_unpackhi_ps(e, f); // [c0 c1 d0 d1]
 439
 440     const __m128 c = _mm_loadu_ps(&m_data[m_indexes[2]]);
 441     const __m128 g = _mm_loadu_ps(&m_data[4 + m_indexes[2]]);
 442     const __m128 d = _mm_loadu_ps(&m_data[m_indexes[3]]);
 443     const __m128 h = _mm_loadu_ps(&m_data[4 + m_indexes[3]]);
 444
 445     const __m128 tmp1 = _mm_unpacklo_ps(c, d); // [a2 a3 b2 b3]
 446     v0.data() = _mm_movelh_ps(tmp0, tmp1);
 447     v1.data() = _mm_movehl_ps(tmp1, tmp0);
 448
 449     const __m128 tmp3 = _mm_unpackhi_ps(c, d); // [c2 c3 d2 d3]
 450     v2.data() = _mm_movelh_ps(tmp2, tmp3);
 451     v3.data() = _mm_movehl_ps(tmp3, tmp2);
 452
 453     const __m128 tmp5 = _mm_unpacklo_ps(g, h); // [a2 a3 b2 b3]
 454     v4.data() = _mm_movelh_ps(tmp4, tmp5);
 455     v5.data() = _mm_movehl_ps(tmp5, tmp4);
 456
 457     const __m128 tmp7 = _mm_unpackhi_ps(g, h); // [c2 c3 d2 d3]
 458     v6.data() = _mm_movelh_ps(tmp6, tmp7);
 459     v7.data() = _mm_movehl_ps(tmp7, tmp6);
 460 }/*}}}*/
 461
 462 static inline void _sse_deinterleave_double(const double *VC_RESTRICT data, const uint_v &indexes, double_v &v0, double_v &v1)/*{{{*/
 463 {
 464     const __m128d a = _mm_loadu_pd(&data[indexes[0]]);
 465     const __m128d b = _mm_loadu_pd(&data[indexes[1]]);
 466
 467     v0.data() = _mm_unpacklo_pd(a, b);
 468     v1.data() = _mm_unpackhi_pd(a, b);
 469 }/*}}}*/
 470 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<double_v>::deinterleave(double_v &v0, double_v &v1) const {/*{{{*/
 471     _sse_deinterleave_double(m_data, m_indexes, v0, v1);
 472 }
 473 /*}}}*/
 474 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<double_v>::deinterleave(double_v &v0, double_v &v1,/*{{{*/
 475         double_v &v2) const {
 476     v2.gather(m_data + 2, m_indexes);
 477     _sse_deinterleave_double(m_data, m_indexes, v0, v1);
 478 }
 479 /*}}}*/
 480 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<double_v>::deinterleave(double_v &v0, double_v &v1,/*{{{*/
 481         double_v &v2, double_v &v3) const {
 482     _sse_deinterleave_double(m_data    , m_indexes, v0, v1);
 483     _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3);
 484 }
 485 /*}}}*/
 486 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<double_v>::deinterleave(double_v &v0, double_v &v1,/*{{{*/
 487         double_v &v2, double_v &v3, double_v &v4) const {
 488     v4.gather(m_data + 4, m_indexes);
 489     _sse_deinterleave_double(m_data    , m_indexes, v0, v1);
 490     _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3);
 491 }
 492 /*}}}*/
 493 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<double_v>::deinterleave(double_v &v0, double_v &v1,/*{{{*/
 494         double_v &v2, double_v &v3, double_v &v4, double_v &v5) const {
 495     _sse_deinterleave_double(m_data    , m_indexes, v0, v1);
 496     _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3);
 497     _sse_deinterleave_double(m_data + 4, m_indexes, v4, v5);
 498 }
 499 /*}}}*/
 500 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<double_v>::deinterleave(double_v &v0, double_v &v1,/*{{{*/
 501         double_v &v2, double_v &v3, double_v &v4, double_v &v5, double_v &v6) const {
 502     v6.gather(m_data + 6, m_indexes);
 503     _sse_deinterleave_double(m_data    , m_indexes, v0, v1);
 504     _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3);
 505     _sse_deinterleave_double(m_data + 4, m_indexes, v4, v5);
 506 }
 507 /*}}}*/
 508 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<double_v>::deinterleave(double_v &v0, double_v &v1,/*{{{*/
 509         double_v &v2, double_v &v3, double_v &v4, double_v &v5, double_v &v6, double_v &v7) const {
 510     _sse_deinterleave_double(m_data    , m_indexes, v0, v1);
 511     _sse_deinterleave_double(m_data + 2, m_indexes, v2, v3);
 512     _sse_deinterleave_double(m_data + 4, m_indexes, v4, v5);
 513     _sse_deinterleave_double(m_data + 6, m_indexes, v6, v7);
 514 }/*}}}*/
 515
 516 template<> inline void InterleavedMemoryAccessBase<short_v>::deinterleave(short_v &v0, short_v &v1) const {/*{{{*/
         // Deinterleave two adjacent 16-bit members, read at the eight positions
         // m_data[m_indexes[0..7]], into v0 (first member) and v1 (second member).
         //
         // Lane notation used below: the letter names the member (a = first,
         // b = second), the digit is the index position k it was loaded from.
         //
         // Each load reads 32 bits (two 16-bit lanes) through a const int pointer;
         // _mm_cvtsi32_si128 places them in the lowest lanes and zeroes the rest.
         // NOTE(review): the read goes through a reinterpret_cast of the element
         // pointer to const int * - confirm this is acceptable with respect to
         // alignment and aliasing for the element type of m_data.
 517     const __m128i a = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(&m_data[m_indexes[0]]));
 518     const __m128i b = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(&m_data[m_indexes[1]]));
 519     const __m128i c = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(&m_data[m_indexes[2]]));
 520     const __m128i d = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(&m_data[m_indexes[3]]));
 521     const __m128i e = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(&m_data[m_indexes[4]]));
 522     const __m128i f = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(&m_data[m_indexes[5]]));
 523     const __m128i g = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(&m_data[m_indexes[6]]));
 524     const __m128i h = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(&m_data[m_indexes[7]]));
 525
         // Stage 1: pair index positions k and k+4 (upper four lanes are zero here).
 526     const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4  0  0  0  0
 527     const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6  0  0  0  0
 528     const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5  0  0  0  0
 529     const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7  0  0  0  0
 530
         // Stage 2: collect the even and the odd index positions.
 531     const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
 532     const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
 533
         // Stage 3: merge even and odd positions into index order.
 534     v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); // a0 a1 a2 a3 a4 a5 a6 a7
 535     v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); // b0 b1 b2 b3 b4 b5 b6 b7
 536 }
 537 /*}}}*/
 538 template<> inline void InterleavedMemoryAccessBase<short_v>::deinterleave(short_v &v0, short_v &v1,/*{{{*/
 539         short_v &v2) const {
         // Deinterleave three adjacent 16-bit members, read at the eight positions
         // m_data[m_indexes[0..7]], into v0, v1 and v2.
         //
         // Lane notation used below: the letter names the member (a = first,
         // b = second, ...), the digit is the index position k it was loaded from.
         //
         // _mm_loadl_epi64 reads 64 bits (four 16-bit lanes) per index position and
         // zeroes the upper half, so a fourth lane (d) is read but never stored.
 540     const __m128i a = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[0]]));
 541     const __m128i b = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[1]]));
 542     const __m128i c = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[2]]));
 543     const __m128i d = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[3]]));
 544     const __m128i e = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[4]]));
 545     const __m128i f = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[5]]));
 546     const __m128i g = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[6]]));
 547     const __m128i h = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[7]]));
 548
         // Stage 1: pair index positions k and k+4.
 549     const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
 550     const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
 551     const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
 552     const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
 553
         // Stage 2: collect the even and the odd index positions per member pair.
 554     const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
 555     const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
 556     const __m128i tmp6  = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
 557     const __m128i tmp7  = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
 558
         // Stage 3: merge even and odd positions into index order.
 559     v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); // a0 .. a7
 560     v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); // b0 .. b7
 561     v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); // c0 .. c7
 562 }/*}}}*/
 563 template<> inline void InterleavedMemoryAccessBase<short_v>::deinterleave(short_v &v0, short_v &v1,/*{{{*/
 564         short_v &v2, short_v &v3) const {
         // Deinterleave four adjacent 16-bit members, read at the eight positions
         // m_data[m_indexes[0..7]], into v0..v3.
         //
         // Lane notation used below: the letter names the member (a = first,
         // b = second, ...), the digit is the index position k it was loaded from.
         //
         // _mm_loadl_epi64 reads exactly the 64 bits (four 16-bit lanes) needed per
         // index position and zeroes the upper half of the register.
 565     const __m128i a = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[0]]));
 566     const __m128i b = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[1]]));
 567     const __m128i c = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[2]]));
 568     const __m128i d = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[3]]));
 569     const __m128i e = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[4]]));
 570     const __m128i f = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[5]]));
 571     const __m128i g = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[6]]));
 572     const __m128i h = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&m_data[m_indexes[7]]));
 573
         // Stage 1: pair index positions k and k+4.
 574     const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
 575     const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
 576     const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
 577     const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
 578
         // Stage 2: collect the even and the odd index positions per member pair.
 579     const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
 580     const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
 581     const __m128i tmp6  = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
 582     const __m128i tmp7  = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
 583
         // Stage 3: merge even and odd positions into index order.
 584     v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); // a0 .. a7
 585     v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); // b0 .. b7
 586     v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); // c0 .. c7
 587     v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); // d0 .. d7
 588 }/*}}}*/
 589 template<> inline void InterleavedMemoryAccessBase<short_v>::deinterleave(short_v &v0, short_v &v1,/*{{{*/
 590         short_v &v2, short_v &v3, short_v &v4) const {
         // Deinterleave five adjacent 16-bit members, read at the eight positions
         // m_data[m_indexes[0..7]], into v0..v4.
         //
         // Lane notation used below: the letter names the member (a = first,
         // b = second, ...), the digit is the index position k it was loaded from.
         //
         // _mm_loadu_si128 reads 128 bits (eight 16-bit lanes) per index position
         // without an alignment requirement, so the lanes f, g and h are read as
         // well although they are never stored.
 591     const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[0]]));
 592     const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[1]]));
 593     const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[2]]));
 594     const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[3]]));
 595     const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[4]]));
 596     const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[5]]));
 597     const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[6]]));
 598     const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[7]]));
 599
         // Stage 1: pair index positions k and k+4 (low half: members a-d,
         // high half: members e-h).
 600     const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
 601     const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
 602     const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
 603     const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
 604     const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
 605     const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e2 e6 f2 f6 g2 g6 h2 h6
 606     const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e1 e5 f1 f5 g1 g5 h1 h5
 607     const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7
 608
         // Stage 2: collect the even and the odd index positions per member pair.
 609     const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
 610     const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
 611     const __m128i tmp6  = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
 612     const __m128i tmp7  = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
 613     const __m128i tmp8  = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
 614     const __m128i tmp9  = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
 615
         // Stage 3: merge even and odd positions into index order.
 616     v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); // a0 .. a7
 617     v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); // b0 .. b7
 618     v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); // c0 .. c7
 619     v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); // d0 .. d7
 620     v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); // e0 .. e7
 621 }/*}}}*/
 622 template<> inline void InterleavedMemoryAccessBase<short_v>::deinterleave(short_v &v0, short_v &v1,/*{{{*/
 623         short_v &v2, short_v &v3, short_v &v4, short_v &v5) const {
         // Deinterleave six adjacent 16-bit members, read at the eight positions
         // m_data[m_indexes[0..7]], into v0..v5.
         //
         // Lane notation used below: the letter names the member (a = first,
         // b = second, ...), the digit is the index position k it was loaded from.
         //
         // _mm_loadu_si128 reads 128 bits (eight 16-bit lanes) per index position
         // without an alignment requirement, so the lanes g and h are read as well
         // although they are never stored.
 624     const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[0]]));
 625     const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[1]]));
 626     const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[2]]));
 627     const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[3]]));
 628     const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[4]]));
 629     const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[5]]));
 630     const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[6]]));
 631     const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[7]]));
 632
         // Stage 1: pair index positions k and k+4 (low half: members a-d,
         // high half: members e-h).
 633     const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
 634     const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
 635     const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
 636     const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
 637     const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
 638     const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e2 e6 f2 f6 g2 g6 h2 h6
 639     const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e1 e5 f1 f5 g1 g5 h1 h5
 640     const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7
 641
         // Stage 2: collect the even and the odd index positions per member pair.
 642     const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
 643     const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
 644     const __m128i tmp6  = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
 645     const __m128i tmp7  = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
 646     const __m128i tmp8  = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
 647     const __m128i tmp9  = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
 648
         // Stage 3: merge even and odd positions into index order.
 649     v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); // a0 .. a7
 650     v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); // b0 .. b7
 651     v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); // c0 .. c7
 652     v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); // d0 .. d7
 653     v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); // e0 .. e7
 654     v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); // f0 .. f7
 655 }/*}}}*/
 656 template<> inline void InterleavedMemoryAccessBase<short_v>::deinterleave(short_v &v0, short_v &v1,/*{{{*/
 657         short_v &v2, short_v &v3, short_v &v4, short_v &v5, short_v &v6) const {
         // Deinterleave seven adjacent 16-bit members, read at the eight positions
         // m_data[m_indexes[0..7]], into v0..v6.
         //
         // Lane notation used below: the letter names the member (a = first,
         // b = second, ...), the digit is the index position k it was loaded from.
         //
         // _mm_loadu_si128 reads 128 bits (eight 16-bit lanes) per index position
         // without an alignment requirement, so lane h is read as well although it
         // is never stored.
 658     const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[0]]));
 659     const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[1]]));
 660     const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[2]]));
 661     const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[3]]));
 662     const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[4]]));
 663     const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[5]]));
 664     const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[6]]));
 665     const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[7]]));
 666
         // Stage 1: pair index positions k and k+4 (low half: members a-d,
         // high half: members e-h).
 667     const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
 668     const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
 669     const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
 670     const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
 671     const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
 672     const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e2 e6 f2 f6 g2 g6 h2 h6
 673     const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e1 e5 f1 f5 g1 g5 h1 h5
 674     const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7
 675
         // Stage 2: collect the even and the odd index positions per member pair.
 676     const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
 677     const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
 678     const __m128i tmp6  = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
 679     const __m128i tmp7  = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
 680     const __m128i tmp8  = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
 681     const __m128i tmp9  = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
 682     const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6
 683     const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7
 684
         // Stage 3: merge even and odd positions into index order.
 685     v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); // a0 .. a7
 686     v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); // b0 .. b7
 687     v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); // c0 .. c7
 688     v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); // d0 .. d7
 689     v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); // e0 .. e7
 690     v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); // f0 .. f7
 691     v6.data() = _mm_unpacklo_epi16(tmp14, tmp15); // g0 .. g7
 692 }/*}}}*/
 693 template<> inline void InterleavedMemoryAccessBase<short_v>::deinterleave(short_v &v0, short_v &v1,/*{{{*/
 694         short_v &v2, short_v &v3, short_v &v4, short_v &v5, short_v &v6, short_v &v7) const {
         // Deinterleave eight adjacent 16-bit members, read at the eight positions
         // m_data[m_indexes[0..7]], into v0..v7. This is a full 8x8 transpose of
         // 16-bit lanes done with three stages of unpack instructions.
         //
         // Lane notation used below: the letter names the member (a = first,
         // b = second, ...), the digit is the index position k it was loaded from.
         //
         // _mm_loadu_si128 reads 128 bits (eight 16-bit lanes) per index position
         // without an alignment requirement.
 695     const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[0]]));
 696     const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[1]]));
 697     const __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[2]]));
 698     const __m128i d = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[3]]));
 699     const __m128i e = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[4]]));
 700     const __m128i f = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[5]]));
 701     const __m128i g = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[6]]));
 702     const __m128i h = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&m_data[m_indexes[7]]));
 703
         // Stage 1: pair index positions k and k+4 (low half: members a-d,
         // high half: members e-h).
 704     const __m128i tmp2  = _mm_unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4
 705     const __m128i tmp4  = _mm_unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
 706     const __m128i tmp3  = _mm_unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
 707     const __m128i tmp5  = _mm_unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
 708     const __m128i tmp10 = _mm_unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
 709     const __m128i tmp11 = _mm_unpackhi_epi16(c, g); // e2 e6 f2 f6 g2 g6 h2 h6
 710     const __m128i tmp12 = _mm_unpackhi_epi16(b, f); // e1 e5 f1 f5 g1 g5 h1 h5
 711     const __m128i tmp13 = _mm_unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7
 712
         // Stage 2: collect the even and the odd index positions per member pair.
 713     const __m128i tmp0  = _mm_unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6
 714     const __m128i tmp1  = _mm_unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
 715     const __m128i tmp6  = _mm_unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
 716     const __m128i tmp7  = _mm_unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
 717     const __m128i tmp8  = _mm_unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
 718     const __m128i tmp9  = _mm_unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
 719     const __m128i tmp14 = _mm_unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6
 720     const __m128i tmp15 = _mm_unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7
 721
         // Stage 3: merge even and odd positions into index order.
 722     v0.data() = _mm_unpacklo_epi16(tmp0, tmp1); // a0 .. a7
 723     v1.data() = _mm_unpackhi_epi16(tmp0, tmp1); // b0 .. b7
 724     v2.data() = _mm_unpacklo_epi16(tmp6, tmp7); // c0 .. c7
 725     v3.data() = _mm_unpackhi_epi16(tmp6, tmp7); // d0 .. d7
 726     v4.data() = _mm_unpacklo_epi16(tmp8, tmp9); // e0 .. e7
 727     v5.data() = _mm_unpackhi_epi16(tmp8, tmp9); // f0 .. f7
 728     v6.data() = _mm_unpacklo_epi16(tmp14, tmp15); // g0 .. g7
 729     v7.data() = _mm_unpackhi_epi16(tmp14, tmp15); // h0 .. h7
 730 }/*}}}*/
 731
 732 template<> inline void InterleavedMemoryAccessBase<sfloat_v>::deinterleave(sfloat_v &v0, sfloat_v &v1) const/*{{{*/
 733 {
         // Deinterleave two adjacent floats, read at the eight positions
         // m_data[m_indexes[0..7]], into v0 (first member) and v1 (second member).
         //
         // Lane notation used below: the letter names the member (a = first,
         // b = second), the digit is the index position k it was loaded from.
         //
         // _mm_load_sd reads 64 bits (two floats) per index position into the low
         // half of the register and zeroes the upper half; the casts only
         // reinterpret the pointer and register types.
 734     const __m128 i0a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[0]])));
 735     const __m128 i1a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[1]])));
 736     const __m128 i2a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[2]])));
 737     const __m128 i3a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[3]])));
 738     const __m128 i4a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[4]])));
 739     const __m128 i5a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[5]])));
 740     const __m128 i6a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[6]])));
 741     const __m128 i7a = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[7]])));
 742
         // Pair neighbouring index positions, then split the pairs per member.
 743     const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1]
 744     const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3]
 745     const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5]
 746     const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7]
         // M256::create receives [x0 x1 x2 x3] first and [x4 x5 x6 x7] second.
 747     v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67));
 748     v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45));
 749 }
 750 /*}}}*/
 751 template<> inline void InterleavedMemoryAccessBase<sfloat_v>::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2) const/*{{{*/
 752 {
         // Deinterleave three adjacent floats, read at the eight positions
         // m_data[m_indexes[0..7]], into v0, v1 and v2.
         //
         // Lane notation used below: the letter names the member (a = first,
         // b = second, ...), the digit is the index position k it was loaded from.
         //
         // _mm_loadu_ps reads four floats per index position without an alignment
         // requirement, so a fourth lane (d) is read but never stored.
 753     const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]);
 754     const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]);
 755     const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]);
 756     const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]);
 757     const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]);
 758     const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]);
 759     const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]);
 760     const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]);
 761
         // Members a and b: pair neighbouring index positions, then split per member.
         // M256::create receives [x0 x1 x2 x3] first and [x4 x5 x6 x7] second.
 762     const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1]
 763     const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3]
 764     const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5]
 765     const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7]
 766     v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67));
 767     v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45));
 768
         // Member c comes from the upper halves of the same loads; d is dropped.
 769     const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1]
 770     const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3]
 771     const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5]
 772     const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7]
 773     v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67));
 774 }
 775 /*}}}*/
 776 template<> inline void InterleavedMemoryAccessBase<sfloat_v>::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3) const/*{{{*/
 777 {
         // Deinterleave four adjacent floats, read at the eight positions
         // m_data[m_indexes[0..7]], into v0..v3.
         //
         // Lane notation used below: the letter names the member (a = first,
         // b = second, ...), the digit is the index position k it was loaded from.
         //
         // _mm_loadu_ps reads exactly the four floats needed per index position
         // and has no alignment requirement.
 778     const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]);
 779     const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]);
 780     const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]);
 781     const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]);
 782     const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]);
 783     const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]);
 784     const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]);
 785     const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]);
 786
         // Members a and b: pair neighbouring index positions, then split per member.
         // M256::create receives [x0 x1 x2 x3] first and [x4 x5 x6 x7] second.
 787     const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1]
 788     const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3]
 789     const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5]
 790     const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7]
 791     v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67));
 792     v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45));
 793
         // Members c and d come from the upper halves of the same loads.
 794     const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1]
 795     const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3]
 796     const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5]
 797     const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7]
 798     v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67));
 799     v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45));
 800 }
 801 /*}}}*/
 802 template<> inline void InterleavedMemoryAccessBase<sfloat_v>::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4) const/*{{{*/
 803 {
         // Deinterleave five adjacent floats, read at the eight positions
         // m_data[m_indexes[0..7]], into v0..v4.
         //
         // Lane notation used below: the letter names the member (a = first,
         // b = second, ...), the digit is the index position k it was loaded from.
         //
         // The first four members are loaded with _mm_loadu_ps (four floats per
         // index position, no alignment requirement) and transposed with shuffles.
 804     const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]);
 805     const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]);
 806     const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]);
 807     const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]);
 808     const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]);
 809     const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]);
 810     const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]);
 811     const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]);
         // The remaining member is fetched with a gather from the base pointer
         // advanced by float_v::Size, using the same indexes.
         // NOTE(review): presumably float_v::Size is 4 here, so this addresses the
         // fifth member - confirm.
 812     v4.gather(m_data + float_v::Size, m_indexes);
 813
         // Members a and b: pair neighbouring index positions, then split per member.
         // M256::create receives [x0 x1 x2 x3] first and [x4 x5 x6 x7] second.
 814     const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1]
 815     const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3]
 816     const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5]
 817     const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7]
 818     v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67));
 819     v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45));
 820
         // Members c and d come from the upper halves of the same loads.
 821     const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1]
 822     const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3]
 823     const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5]
 824     const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7]
 825     v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67));
 826     v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45));
 827 }
 828 /*}}}*/
 829 template<> inline void InterleavedMemoryAccessBase<sfloat_v>::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5) const/*{{{*/
 830 {
         // Deinterleave six adjacent floats, read at the eight positions
         // m_data[m_indexes[0..7]], into v0..v5.
         //
         // Lane notation used below: the letter names the member (a = first,
         // b = second, ...), the digit is the index position k it was loaded from.
         //
         // iNa: the first four floats per index position (_mm_loadu_ps, no
         // alignment requirement).
 831     const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]);
 832     const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]);
 833     const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]);
 834     const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]);
 835     const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]);
 836     const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]);
 837     const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]);
 838     const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]);
         // iNb: two more floats per index position, float_v::Size elements further
         // on; _mm_load_sd reads 64 bits into the low half and zeroes the upper half.
         // NOTE(review): presumably float_v::Size is 4 here, so these are the fifth
         // and sixth members - confirm.
 839     const __m128 i0b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[0] + float_v::Size])));
 840     const __m128 i1b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[1] + float_v::Size])));
 841     const __m128 i2b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[2] + float_v::Size])));
 842     const __m128 i3b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[3] + float_v::Size])));
 843     const __m128 i4b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[4] + float_v::Size])));
 844     const __m128 i5b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[5] + float_v::Size])));
 845     const __m128 i6b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[6] + float_v::Size])));
 846     const __m128 i7b = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double *>(&m_data[m_indexes[7] + float_v::Size])));
 847
         // Members a and b: pair neighbouring index positions, then split per member.
         // M256::create receives [x0 x1 x2 x3] first and [x4 x5 x6 x7] second.
 848     const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1]
 849     const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3]
 850     const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5]
 851     const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7]
 852     v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67));
 853     v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45));
 854
         // Members c and d come from the upper halves of the iNa loads.
 855     const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1]
 856     const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3]
 857     const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5]
 858     const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7]
 859     v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67));
 860     v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45));
 861
         // Members e and f come from the low halves of the iNb loads.
 862     const __m128 ef01 = _mm_unpacklo_ps(i0b, i1b); // [e0 e1 f0 f1]
 863     const __m128 ef23 = _mm_unpacklo_ps(i2b, i3b); // [e2 e3 f2 f3]
 864     const __m128 ef45 = _mm_unpacklo_ps(i4b, i5b); // [e4 e5 f4 f5]
 865     const __m128 ef67 = _mm_unpacklo_ps(i6b, i7b); // [e6 e7 f6 f7]
 866     v4.data() = Vc::SSE::M256::create(_mm_movelh_ps(ef01, ef23), _mm_movelh_ps(ef45, ef67));
 867     v5.data() = Vc::SSE::M256::create(_mm_movehl_ps(ef23, ef01), _mm_movehl_ps(ef67, ef45));
 868 }
 869 /*}}}*/
 870 template<> inline void InterleavedMemoryAccessBase<sfloat_v>::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5, sfloat_v &v6) const/*{{{*/
 871 {
         // Deinterleave seven adjacent floats, read at the eight positions
         // m_data[m_indexes[0..7]], into v0..v6.
         //
         // Lane notation used below: the letter names the member (a = first,
         // b = second, ...), the digit is the index position k it was loaded from.
         //
         // Per index position two unaligned four-float loads are issued: iNa at the
         // index itself and iNb float_v::Size elements further on. The last lane of
         // iNb (h) is read but never stored.
         // NOTE(review): presumably float_v::Size is 4 here, so iNb holds the fifth
         // to eighth floats - confirm.
 872     const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]);
 873     const __m128 i0b = _mm_loadu_ps(&m_data[m_indexes[0] + float_v::Size]);
 874     const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]);
 875     const __m128 i1b = _mm_loadu_ps(&m_data[m_indexes[1] + float_v::Size]);
 876     const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]);
 877     const __m128 i2b = _mm_loadu_ps(&m_data[m_indexes[2] + float_v::Size]);
 878     const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]);
 879     const __m128 i3b = _mm_loadu_ps(&m_data[m_indexes[3] + float_v::Size]);
 880     const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]);
 881     const __m128 i4b = _mm_loadu_ps(&m_data[m_indexes[4] + float_v::Size]);
 882     const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]);
 883     const __m128 i5b = _mm_loadu_ps(&m_data[m_indexes[5] + float_v::Size]);
 884     const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]);
 885     const __m128 i6b = _mm_loadu_ps(&m_data[m_indexes[6] + float_v::Size]);
 886     const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]);
 887     const __m128 i7b = _mm_loadu_ps(&m_data[m_indexes[7] + float_v::Size]);
 888
         // Members a and b: pair neighbouring index positions, then split per member.
         // M256::create receives [x0 x1 x2 x3] first and [x4 x5 x6 x7] second.
 889     const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1]
 890     const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3]
 891     const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5]
 892     const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7]
 893     v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67));
 894     v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45));
 895
         // Members c and d come from the upper halves of the iNa loads.
 896     const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1]
 897     const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3]
 898     const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5]
 899     const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7]
 900     v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67));
 901     v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45));
 902
         // Members e and f come from the low halves of the iNb loads.
 903     const __m128 ef01 = _mm_unpacklo_ps(i0b, i1b); // [e0 e1 f0 f1]
 904     const __m128 ef23 = _mm_unpacklo_ps(i2b, i3b); // [e2 e3 f2 f3]
 905     const __m128 ef45 = _mm_unpacklo_ps(i4b, i5b); // [e4 e5 f4 f5]
 906     const __m128 ef67 = _mm_unpacklo_ps(i6b, i7b); // [e6 e7 f6 f7]
 907     v4.data() = Vc::SSE::M256::create(_mm_movelh_ps(ef01, ef23), _mm_movelh_ps(ef45, ef67));
 908     v5.data() = Vc::SSE::M256::create(_mm_movehl_ps(ef23, ef01), _mm_movehl_ps(ef67, ef45));
 909
         // Member g comes from the upper halves of the iNb loads; h is dropped.
 910     const __m128 gh01 = _mm_unpackhi_ps(i0b, i1b); // [g0 g1 h0 h1]
 911     const __m128 gh23 = _mm_unpackhi_ps(i2b, i3b); // [g2 g3 h2 h3]
 912     const __m128 gh45 = _mm_unpackhi_ps(i4b, i5b); // [g4 g5 h4 h5]
 913     const __m128 gh67 = _mm_unpackhi_ps(i6b, i7b); // [g6 g7 h6 h7]
 914     v6.data() = Vc::SSE::M256::create(_mm_movelh_ps(gh01, gh23), _mm_movelh_ps(gh45, gh67));
 915 }
 916 /*}}}*/
 917 template<> inline void InterleavedMemoryAccessBase<sfloat_v>::deinterleave(sfloat_v &v0, sfloat_v &v1, sfloat_v &v2, sfloat_v &v3, sfloat_v &v4, sfloat_v &v5, sfloat_v &v6, sfloat_v &v7) const/*{{{*/
 918 {
         // Deinterleave eight adjacent floats, read at the eight positions
         // m_data[m_indexes[0..7]], into v0..v7.
         //
         // Lane notation used below: the letter names the member (a = first,
         // b = second, ...), the digit is the index position k it was loaded from.
         //
         // Per index position two unaligned four-float loads are issued: iNa at the
         // index itself and iNb float_v::Size elements further on.
         // NOTE(review): presumably float_v::Size is 4 here, so iNb holds the fifth
         // to eighth floats - confirm.
 919     const __m128 i0a = _mm_loadu_ps(&m_data[m_indexes[0]]);
 920     const __m128 i0b = _mm_loadu_ps(&m_data[m_indexes[0] + float_v::Size]);
 921     const __m128 i1a = _mm_loadu_ps(&m_data[m_indexes[1]]);
 922     const __m128 i1b = _mm_loadu_ps(&m_data[m_indexes[1] + float_v::Size]);
 923     const __m128 i2a = _mm_loadu_ps(&m_data[m_indexes[2]]);
 924     const __m128 i2b = _mm_loadu_ps(&m_data[m_indexes[2] + float_v::Size]);
 925     const __m128 i3a = _mm_loadu_ps(&m_data[m_indexes[3]]);
 926     const __m128 i3b = _mm_loadu_ps(&m_data[m_indexes[3] + float_v::Size]);
 927     const __m128 i4a = _mm_loadu_ps(&m_data[m_indexes[4]]);
 928     const __m128 i4b = _mm_loadu_ps(&m_data[m_indexes[4] + float_v::Size]);
 929     const __m128 i5a = _mm_loadu_ps(&m_data[m_indexes[5]]);
 930     const __m128 i5b = _mm_loadu_ps(&m_data[m_indexes[5] + float_v::Size]);
 931     const __m128 i6a = _mm_loadu_ps(&m_data[m_indexes[6]]);
 932     const __m128 i6b = _mm_loadu_ps(&m_data[m_indexes[6] + float_v::Size]);
 933     const __m128 i7a = _mm_loadu_ps(&m_data[m_indexes[7]]);
 934     const __m128 i7b = _mm_loadu_ps(&m_data[m_indexes[7] + float_v::Size]);
 935
         // Members a and b: pair neighbouring index positions, then split per member.
         // M256::create receives [x0 x1 x2 x3] first and [x4 x5 x6 x7] second.
 936     const __m128 ab01 = _mm_unpacklo_ps(i0a, i1a); // [a0 a1 b0 b1]
 937     const __m128 ab23 = _mm_unpacklo_ps(i2a, i3a); // [a2 a3 b2 b3]
 938     const __m128 ab45 = _mm_unpacklo_ps(i4a, i5a); // [a4 a5 b4 b5]
 939     const __m128 ab67 = _mm_unpacklo_ps(i6a, i7a); // [a6 a7 b6 b7]
 940     v0.data() = Vc::SSE::M256::create(_mm_movelh_ps(ab01, ab23), _mm_movelh_ps(ab45, ab67));
 941     v1.data() = Vc::SSE::M256::create(_mm_movehl_ps(ab23, ab01), _mm_movehl_ps(ab67, ab45));
 942
         // Members c and d come from the upper halves of the iNa loads.
 943     const __m128 cd01 = _mm_unpackhi_ps(i0a, i1a); // [c0 c1 d0 d1]
 944     const __m128 cd23 = _mm_unpackhi_ps(i2a, i3a); // [c2 c3 d2 d3]
 945     const __m128 cd45 = _mm_unpackhi_ps(i4a, i5a); // [c4 c5 d4 d5]
 946     const __m128 cd67 = _mm_unpackhi_ps(i6a, i7a); // [c6 c7 d6 d7]
 947     v2.data() = Vc::SSE::M256::create(_mm_movelh_ps(cd01, cd23), _mm_movelh_ps(cd45, cd67));
 948     v3.data() = Vc::SSE::M256::create(_mm_movehl_ps(cd23, cd01), _mm_movehl_ps(cd67, cd45));
 949
         // Members e and f come from the low halves of the iNb loads.
 950     const __m128 ef01 = _mm_unpacklo_ps(i0b, i1b); // [e0 e1 f0 f1]
 951     const __m128 ef23 = _mm_unpacklo_ps(i2b, i3b); // [e2 e3 f2 f3]
 952     const __m128 ef45 = _mm_unpacklo_ps(i4b, i5b); // [e4 e5 f4 f5]
 953     const __m128 ef67 = _mm_unpacklo_ps(i6b, i7b); // [e6 e7 f6 f7]
 954     v4.data() = Vc::SSE::M256::create(_mm_movelh_ps(ef01, ef23), _mm_movelh_ps(ef45, ef67));
 955     v5.data() = Vc::SSE::M256::create(_mm_movehl_ps(ef23, ef01), _mm_movehl_ps(ef67, ef45));
 956
         // Members g and h come from the upper halves of the iNb loads.
 957     const __m128 gh01 = _mm_unpackhi_ps(i0b, i1b); // [g0 g1 h0 h1]
 958     const __m128 gh23 = _mm_unpackhi_ps(i2b, i3b); // [g2 g3 h2 h3]
 959     const __m128 gh45 = _mm_unpackhi_ps(i4b, i5b); // [g4 g5 h4 h5]
 960     const __m128 gh67 = _mm_unpackhi_ps(i6b, i7b); // [g6 g7 h6 h7]
 961     v6.data() = Vc::SSE::M256::create(_mm_movelh_ps(gh01, gh23), _mm_movelh_ps(gh45, gh67));
 962     v7.data() = Vc::SSE::M256::create(_mm_movehl_ps(gh23, gh01), _mm_movehl_ps(gh67, gh45));
 963 }/*}}}*/
 964
 965 // forward types of equal size - ugly, but it works/*{{{*/
     //
     // _forward(V, V2) emits explicit specializations of
     // InterleavedMemoryAccessBase<V>::deinterleave for two up to eight output
     // vectors (v0..v7). None of them contains an implementation of its own:
     // each one reinterpret_casts `this` to
     // `const InterleavedMemoryAccessBase<V2> *` and every `V &` argument to
     // `V2 &`, then calls the V2 overload taking the same number of arguments.
     //
     // NOTE(review): the casts are only sound if V and V2 have identical size
     // and layout and the V2 code moves the elements without interpreting their
     // values - confirm this for every pair instantiated below.
     // NOTE(review): the name `_forward` starts with an underscore; it is
     // #undef'd directly after its last use below, which keeps it local to
     // this section.
 966 #define _forward(V, V2) \
 967 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::deinterleave(V &v0, V &v1) const { \
 968     reinterpret_cast<const InterleavedMemoryAccessBase<V2> *>(this)->deinterleave(reinterpret_cast<V2 &>(v0), reinterpret_cast<V2 &>(v1)); \
 969 } \
 970 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::deinterleave(V &v0, V &v1, V &v2) const { \
 971     reinterpret_cast<const InterleavedMemoryAccessBase<V2> *>(this)->deinterleave(reinterpret_cast<V2 &>(v0), reinterpret_cast<V2 &>(v1), \
 972             reinterpret_cast<V2 &>(v2)); \
 973 } \
 974 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::deinterleave(V &v0, V &v1, V &v2, V &v3) const { \
 975     reinterpret_cast<const InterleavedMemoryAccessBase<V2> *>(this)->deinterleave(reinterpret_cast<V2 &>(v0), reinterpret_cast<V2 &>(v1), \
 976             reinterpret_cast<V2 &>(v2), reinterpret_cast<V2 &>(v3)); \
 977 } \
 978 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::deinterleave(V &v0, V &v1, V &v2, V &v3, \
 979         V &v4) const { \
 980     reinterpret_cast<const InterleavedMemoryAccessBase<V2> *>(this)->deinterleave(reinterpret_cast<V2 &>(v0), reinterpret_cast<V2 &>(v1), \
 981             reinterpret_cast<V2 &>(v2), reinterpret_cast<V2 &>(v3), reinterpret_cast<V2 &>(v4)); \
 982 } \
 983 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::deinterleave(V &v0, V &v1, V &v2, V &v3, \
 984         V &v4, V &v5) const { \
 985     reinterpret_cast<const InterleavedMemoryAccessBase<V2> *>(this)->deinterleave(reinterpret_cast<V2 &>(v0), reinterpret_cast<V2 &>(v1), \
 986             reinterpret_cast<V2 &>(v2), reinterpret_cast<V2 &>(v3), reinterpret_cast<V2 &>(v4), \
 987             reinterpret_cast<V2 &>(v5)); \
 988 } \
 989 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::deinterleave(V &v0, V &v1, V &v2, V &v3, \
 990         V &v4, V &v5, V &v6) const { \
 991     reinterpret_cast<const InterleavedMemoryAccessBase<V2> *>(this)->deinterleave(reinterpret_cast<V2 &>(v0), reinterpret_cast<V2 &>(v1), \
 992             reinterpret_cast<V2 &>(v2), reinterpret_cast<V2 &>(v3), reinterpret_cast<V2 &>(v4), \
 993             reinterpret_cast<V2 &>(v5), reinterpret_cast<V2 &>(v6)); \
 994 } \
 995 template<> Vc_ALWAYS_INLINE void InterleavedMemoryAccessBase<V>::deinterleave(V &v0, V &v1, V &v2, V &v3, \
 996         V &v4, V &v5, V &v6, V &v7) const { \
 997     reinterpret_cast<const InterleavedMemoryAccessBase<V2> *>(this)->deinterleave(reinterpret_cast<V2 &>(v0), reinterpret_cast<V2 &>(v1), \
 998             reinterpret_cast<V2 &>(v2), reinterpret_cast<V2 &>(v3), reinterpret_cast<V2 &>(v4), \
 999             reinterpret_cast<V2 &>(v5), reinterpret_cast<V2 &>(v6), reinterpret_cast<V2 &>(v7)); \
1000 }
1001 _forward( int_v, float_v) // int_v deinterleave calls are routed to the float_v specializations
1002 _forward(uint_v, float_v) // uint_v deinterleave calls are routed to the float_v specializations
1003 _forward(ushort_v, short_v) // ushort_v deinterleave calls are routed to the short_v specializations
1004 #undef _forward/*}}}*/
1005
1006 } // namespace Common
1007 } // namespace Vc
1008 } // namespace AliRoot
1009
1010 #include "undomacros.h"
1011
1012 #endif // VC_SSE_INTERLEAVEDMEMORY_TCC
1013
1014 // vim: foldmethod=marker