]>
Commit | Line | Data |
---|---|---|
f22341db | 1 | /* This file is part of the Vc library. |
2 | ||
3 | Copyright (C) 2011-2012 Matthias Kretz <kretz@kde.org> | |
4 | ||
5 | Vc is free software: you can redistribute it and/or modify | |
6 | it under the terms of the GNU Lesser General Public License as | |
7 | published by the Free Software Foundation, either version 3 of | |
8 | the License, or (at your option) any later version. | |
9 | ||
10 | Vc is distributed in the hope that it will be useful, but | |
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | GNU Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with Vc. If not, see <http://www.gnu.org/licenses/>. | |
17 | ||
18 | */ | |
19 | ||
20 | #ifndef VC_SSE_SHUFFLE_H | |
21 | #define VC_SSE_SHUFFLE_H | |
22 | ||
c017a39f | 23 | #include "macros.h" |
24 | ||
namespace AliRoot {
namespace Vc
{
// Lane selectors for the shuffle/blend/permute templates below.
// X0..X7 name lanes of the first operand, Y0..Y7 lanes of the second.
// Y0 has the value 8, so (Dst - Y0) recovers a plain lane index and
// (Dst / Y0) is 1 exactly when Dst selects from the second operand.
enum VecPos {
    X0, X1, X2, X3, X4, X5, X6, X7,
    Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7
};

namespace Mem
{
35 | // shuffle<X1, X2, Y0, Y2>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x1 x2 y0 y2] | |
c017a39f | 36 | template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) { |
f22341db | 37 | VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range); |
38 | VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range); | |
39 | return _mm_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64); | |
40 | } | |
41 | ||
42 | // shuffle<X1, Y0>([x0 x1], [y0 y1]) = [x1 y0] | |
c017a39f | 43 | template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) { |
f22341db | 44 | VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= Y0, Incorrect_Range); |
45 | VC_STATIC_ASSERT(Dst0 <= X1 && Dst1 <= Y1, Incorrect_Range); | |
46 | return _mm_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2); | |
47 | } | |
48 | ||
49 | // blend<X0, Y1>([x0 x1], [y0, y1]) = [x0 y1] | |
c017a39f | 50 | template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) { |
f22341db | 51 | VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range); |
52 | VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range); | |
53 | #if !defined(VC_IMPL_SSE4_1) && !defined(VC_IMPL_AVX) | |
54 | using Vc::SSE::_mm_blend_pd; | |
55 | #endif | |
56 | return _mm_blend_pd(x, y, (Dst0 / Y0) + (Dst1 / Y0) * 2); | |
57 | } | |
58 | ||
// blend<X0, Y1, Y2, X3>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x0 y1 y2 x3]
// Per-lane selection between x and y; each DstN must name lane N of either
// operand (values never move to a different lane position).
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
    VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range);
    VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range);
    VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range);
    VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range);
#if !defined(VC_IMPL_SSE4_1) && !defined(VC_IMPL_AVX)
    // BLENDPS needs SSE4.1; pull in Vc's drop-in emulation otherwise
    using Vc::SSE::_mm_blend_ps;
#endif
    // mask bit N is 1 exactly when DstN is a Y selector (DstN / YN == 1 then)
    return _mm_blend_ps(x, y,
            (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
            (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8);
}
72 | ||
// Blend for eight 16-bit words: result word N comes from x when DstN == XN
// and from y when DstN == YN; words never move to a different position.
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m128i Vc_CONST blend(__m128i x, __m128i y) {
    VC_STATIC_ASSERT(Dst0 == X0 || Dst0 == Y0, Incorrect_Range);
    VC_STATIC_ASSERT(Dst1 == X1 || Dst1 == Y1, Incorrect_Range);
    VC_STATIC_ASSERT(Dst2 == X2 || Dst2 == Y2, Incorrect_Range);
    VC_STATIC_ASSERT(Dst3 == X3 || Dst3 == Y3, Incorrect_Range);
    VC_STATIC_ASSERT(Dst4 == X4 || Dst4 == Y4, Incorrect_Range);
    VC_STATIC_ASSERT(Dst5 == X5 || Dst5 == Y5, Incorrect_Range);
    VC_STATIC_ASSERT(Dst6 == X6 || Dst6 == Y6, Incorrect_Range);
    VC_STATIC_ASSERT(Dst7 == X7 || Dst7 == Y7, Incorrect_Range);
#if !defined(VC_IMPL_SSE4_1) && !defined(VC_IMPL_AVX)
    // PBLENDW needs SSE4.1; pull in Vc's drop-in emulation otherwise
    using Vc::SSE::_mm_blend_epi16;
#endif
    // mask bit N is 1 exactly when DstN is a Y selector (DstN / YN == 1 then)
    return _mm_blend_epi16(x, y,
            (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
            (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 +
            (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 +
            (Dst6 / Y6) * 64 + (Dst7 / Y7) *128
            );
}
93 | ||
// permute<X1, X2, X0, X2>([x0 x1 x2 x3]) = [x1 x2 x0 x2]
// Single-operand lane reordering; all selectors must be X0..X3.
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
    VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
    VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
    // SHUFPS with x for both sources; two immediate bits per result lane
    return _mm_shuffle_ps(x, x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
100 | ||
c017a39f | 101 | template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) { |
f22341db | 102 | VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); |
103 | VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); | |
104 | return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); | |
105 | } | |
106 | ||
c017a39f | 107 | template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteLo(__m128i x) { |
f22341db | 108 | VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); |
109 | VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); | |
110 | return _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); | |
111 | } | |
112 | ||
c017a39f | 113 | template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteHi(__m128i x) { |
f22341db | 114 | VC_STATIC_ASSERT(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, Incorrect_Range); |
115 | VC_STATIC_ASSERT(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, Incorrect_Range); | |
116 | return _mm_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64); | |
117 | } | |
118 | ||
// Permute all eight 16-bit words of x. Dst0..Dst3 must stay within the low
// half (X0..X3) and Dst4..Dst7 within the high half (X4..X7), because the
// implementation uses PSHUFLW/PSHUFHW, which cannot cross the 64-bit halves.
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
    VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range);
    VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range);
    VC_STATIC_ASSERT(Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4, Incorrect_Range);
    VC_STATIC_ASSERT(Dst4 <= X7 && Dst5 <= X7 && Dst6 <= X7 && Dst7 <= X7, Incorrect_Range);
    // each half's shuffle is skipped when that half is requested unchanged;
    // the conditions are compile-time constants, so no runtime branch remains
    if (Dst0 != X0 || Dst1 != X1 || Dst2 != X2 || Dst3 != X3) {
        x = _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
    }
    if (Dst4 != X4 || Dst5 != X5 || Dst6 != X6 || Dst7 != X7) {
        x = _mm_shufflehi_epi16(x, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
    }
    return x;
}
133 | } // namespace Mem | |
// The shuffles and permutes above use memory ordering. The ones below use register ordering:
namespace Reg
{
// shuffle<Y2, Y0, X2, X1>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1]
// Register-order wrapper: template arguments are given most-significant lane
// first and forwarded in reversed order to the memory-order Mem::shuffle.
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
    return Mem::shuffle<Dst0, Dst1, Dst2, Dst3>(x, y);
}
141 | ||
// shuffle<Y0, X1>([x1 x0], [y1 y0]) = [y0 x1]
// Register-order wrapper: arguments reversed and forwarded to Mem::shuffle.
template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
    return Mem::shuffle<Dst0, Dst1>(x, y);
}
146 | ||
147 | // shuffle<X3, X0, X2, X1>([x3 x2 x1 x0]) = [x3 x0 x2 x1] | |
c017a39f | 148 | template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) { |
f22341db | 149 | VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, Incorrect_Range); |
150 | VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, Incorrect_Range); | |
151 | return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64); | |
152 | } | |
153 | ||
154 | // shuffle<Y2, Y0, X2, X1>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1] | |
c017a39f | 155 | template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST shuffle(__m128i x, __m128i y) { |
f22341db | 156 | VC_STATIC_ASSERT(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, Incorrect_Range); |
157 | VC_STATIC_ASSERT(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, Incorrect_Range); | |
158 | return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x), _mm_castsi128_ps(y), Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64)); | |
159 | } | |
160 | ||
// blend<Y1, X0>([x1 x0], [y1, y0]) = [x1 y0]
// Register-order wrapper: arguments reversed and forwarded to Mem::blend.
template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
    return Mem::blend<Dst0, Dst1>(x, y);
}
165 | ||
// Register-order wrapper around the four-lane Mem::blend (lane order reversed).
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
    return Mem::blend<Dst0, Dst1, Dst2, Dst3>(x, y);
}
169 | } // namespace Reg | |
170 | } // namespace Vc | |
c017a39f | 171 | } // namespace AliRoot |
172 | ||
173 | #include "undomacros.h" | |
f22341db | 174 | |
175 | #endif // VC_SSE_SHUFFLE_H |