// LoadHelper specialization: load unsigned short data into an int vector.
// load() reads 64 bits from mem with _mm_loadl_epi64 and widens the 16-bit
// values to 32-bit lanes using the zero-extending cvtepu16_epi32 conversion.
// The Flags argument is unnamed and is not used in the body.
// NOTE(review): the patch replaces the _mm_cvtepu16_epi32 intrinsic with the
// un-prefixed mm_cvtepu16_epi32, which is defined outside this excerpt --
// presumably a wrapper with the same semantics; confirm.
template<typename Flags> struct LoadHelper<int, unsigned short, Flags> {
static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags)
{
- return _mm_cvtepu16_epi32( _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
+ return mm_cvtepu16_epi32( _mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
};
// LoadHelper specialization: load short data into an int vector.
// load() reads 64 bits from mem with _mm_loadl_epi64 and widens the 16-bit
// values to 32-bit lanes using the sign-extending cvtepi16_epi32 conversion.
// The Flags argument is unnamed and is not used in the body.
// NOTE(review): mm_cvtepi16_epi32 (new side of the patch) is defined outside
// this excerpt -- presumably equivalent to the _mm_ intrinsic; confirm.
template<typename Flags> struct LoadHelper<int, short, Flags> {
static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const short *mem, Flags)
{
- return _mm_cvtepi16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
+ return mm_cvtepi16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
};
// LoadHelper specialization: load unsigned char data into an int vector.
// load() reads one int through mem, places it in the low 32 bits of a register
// (_mm_cvtsi32_si128) and widens the bytes to 32-bit lanes using the
// zero-extending cvtepu8_epi32 conversion.
// The Flags argument is unnamed and is not used in the body.
// NOTE(review): mem is dereferenced as `const int *`, so this relies on an int
// read at that address being valid -- confirm alignment/aliasing assumptions.
// NOTE(review): mm_cvtepu8_epi32 (new side of the patch) is defined outside
// this excerpt -- presumably equivalent to the _mm_ intrinsic; confirm.
template<typename Flags> struct LoadHelper<int, unsigned char, Flags> {
static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags)
{
- return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
+ return mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
}
};
// LoadHelper specialization: load signed char data into an int vector.
// load() reads one int through mem, places it in the low 32 bits of a register
// (_mm_cvtsi32_si128) and widens the bytes to 32-bit lanes using the
// sign-extending cvtepi8_epi32 conversion.
// The Flags argument is unnamed and is not used in the body.
// NOTE(review): mem is dereferenced as `const int *`, so this relies on an int
// read at that address being valid -- confirm alignment/aliasing assumptions.
// NOTE(review): mm_cvtepi8_epi32 (new side of the patch) is defined outside
// this excerpt -- presumably equivalent to the _mm_ intrinsic; confirm.
template<typename Flags> struct LoadHelper<int, signed char, Flags> {
static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const signed char *mem, Flags)
{
- return _mm_cvtepi8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
+ return mm_cvtepi8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
}
};
// LoadHelper specialization: load unsigned short data into an unsigned int
// vector. load() reads 64 bits from mem with _mm_loadl_epi64 and widens the
// 16-bit values to 32-bit lanes using the zero-extending cvtepu16_epi32
// conversion. The Flags argument is unnamed and is not used in the body.
// NOTE(review): mm_cvtepu16_epi32 (new side of the patch) is defined outside
// this excerpt -- presumably equivalent to the _mm_ intrinsic; confirm.
template<typename Flags> struct LoadHelper<unsigned int, unsigned short, Flags> {
static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags)
{
- return _mm_cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
+ return mm_cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
};
// LoadHelper specialization: load unsigned char data into an unsigned int
// vector. load() reads one int through mem, places it in the low 32 bits of a
// register (_mm_cvtsi32_si128) and widens the bytes to 32-bit lanes using the
// zero-extending cvtepu8_epi32 conversion.
// The Flags argument is unnamed and is not used in the body.
// NOTE(review): mem is dereferenced as `const int *`, so this relies on an int
// read at that address being valid -- confirm alignment/aliasing assumptions.
// NOTE(review): mm_cvtepu8_epi32 (new side of the patch) is defined outside
// this excerpt -- presumably equivalent to the _mm_ intrinsic; confirm.
template<typename Flags> struct LoadHelper<unsigned int, unsigned char, Flags> {
static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags)
{
- return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
+ return mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
}
};
// LoadHelper specialization: load unsigned char data into a short vector.
// load() reads 64 bits from mem with _mm_loadl_epi64 and widens the bytes to
// 16-bit lanes using the zero-extending cvtepu8_epi16 conversion.
// The Flags argument is unnamed and is not used in the body.
// NOTE(review): mm_cvtepu8_epi16 (new side of the patch) is defined outside
// this excerpt -- presumably equivalent to the _mm_ intrinsic; confirm.
template<typename Flags> struct LoadHelper<short, unsigned char, Flags> {
static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags)
{
- return _mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
+ return mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
};
// LoadHelper specialization: load signed char data into a short vector.
// load() reads 64 bits from mem with _mm_loadl_epi64 and widens the bytes to
// 16-bit lanes using the sign-extending cvtepi8_epi16 conversion.
// The Flags argument is unnamed and is not used in the body.
// NOTE(review): mm_cvtepi8_epi16 (new side of the patch) is defined outside
// this excerpt -- presumably equivalent to the _mm_ intrinsic; confirm.
template<typename Flags> struct LoadHelper<short, signed char, Flags> {
static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const signed char *mem, Flags)
{
- return _mm_cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
+ return mm_cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
};
// LoadHelper specialization: load unsigned char data into an unsigned short
// vector. load() reads 64 bits from mem with _mm_loadl_epi64 and widens the
// bytes to 16-bit lanes using the zero-extending cvtepu8_epi16 conversion.
// The Flags argument is unnamed and is not used in the body.
// NOTE(review): mm_cvtepu8_epi16 (new side of the patch) is defined outside
// this excerpt -- presumably equivalent to the _mm_ intrinsic; confirm.
template<typename Flags> struct LoadHelper<unsigned short, unsigned char, Flags> {
static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags)
{
- return _mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
+ return mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
};
// shifted / rotated {{{1
// Returns a copy of this vector shifted by `amount` entries, implemented as a
// whole-register byte shift of the data reinterpreted as __m128i:
//  - amount == 0        : returns *this unchanged.
//  - amount in [1, 8]   : _mm_srli_si128 by amount * sizeof(EntryType) bytes.
//  - amount in [-8, -1] : _mm_slli_si128 by |amount| * sizeof(EntryType) bytes.
//  - any other amount   : falls out of the switch and returns Zero().
// Every amount has its own case because the byte-shift intrinsics take the
// shift count as a compile-time constant.
template<typename T> Vc_INTRINSIC Vc_PURE Vector<T> Vector<T>::shifted(int amount) const
{
// Patch addition: sizeof(EntryType) exposed as an enumerator, which the new
// case lines use in place of the sizeof expression.
// NOTE(review): presumably done so the shift count is an int-like constant
// rather than a size_t expression -- the rationale is not visible here.
+ enum {
+ EntryTypeSizeof = sizeof(EntryType)
+ };
switch (amount) {
case 0: return *this;
- case 1: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 1 * sizeof(EntryType)));
- case 2: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 2 * sizeof(EntryType)));
- case 3: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 3 * sizeof(EntryType)));
- case 4: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 4 * sizeof(EntryType)));
- case 5: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 5 * sizeof(EntryType)));
- case 6: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 6 * sizeof(EntryType)));
- case 7: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 7 * sizeof(EntryType)));
- case 8: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 8 * sizeof(EntryType)));
- case -1: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 1 * sizeof(EntryType)));
- case -2: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 2 * sizeof(EntryType)));
- case -3: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 3 * sizeof(EntryType)));
- case -4: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 4 * sizeof(EntryType)));
- case -5: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 5 * sizeof(EntryType)));
- case -6: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 6 * sizeof(EntryType)));
- case -7: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 7 * sizeof(EntryType)));
- case -8: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 8 * sizeof(EntryType)));
// New side of the patch: identical calls, with EntryTypeSizeof as the factor.
+ case 1: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
+ case 2: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
+ case 3: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
+ case 4: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
+ case 5: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
+ case 6: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
+ case 7: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
+ case 8: return mm128_reinterpret_cast<VectorType>(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
+ case -1: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
+ case -2: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
+ case -3: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
+ case -4: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
+ case -5: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
+ case -6: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
+ case -7: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
+ case -8: return mm128_reinterpret_cast<VectorType>(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
}
// No case matched: the requested shift is outside the handled range.
return Zero();
}
// Specialization of shifted() for sfloat_v, whose data is accessed as two
// __m128 parts (d.v()[0] and d.v()[1]) and reassembled with M256::create.
// Handles amount in [-7, 7]; any other amount returns Zero().
// Parts that receive no data are filled with _mm_setzero_ps().
// NOTE(review): judging by case 4 (d.v()[1] is passed first and zero second),
// a positive amount moves entries toward the first part -- confirm against the
// argument order of M256::create, which is not visible here.
template<> Vc_INTRINSIC Vc_PURE sfloat_v sfloat_v::shifted(int amount) const
{
// Patch addition: sizeof(EntryType) exposed as an enumerator, which the new
// case lines use in place of the sizeof expression.
+ enum {
+ EntryTypeSizeof = sizeof(EntryType)
+ };
switch (amount) {
// amount in [-7, -5]: only d.v()[0] contributes; the first part is zero and
// the second part is d.v()[0] byte-shifted left by (|amount| - 4) entries.
- case -7: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 3 * sizeof(EntryType))));
- case -6: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 2 * sizeof(EntryType))));
- case -5: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 1 * sizeof(EntryType))));
+ case -7: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof)));
+ case -6: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof)));
+ case -5: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof)));
// amount == -4: d.v()[0] becomes the second part as-is.
case -4: return M256::create(_mm_setzero_ps(), d.v()[0]);
// amount in [-3, -1]: first part is d.v()[0] byte-shifted left; second part is
// an alignr across (d.v()[1], d.v()[0]) with byte offset (4 - |amount|) entries.
// The new lines also switch _mm_alignr_epi8 to the un-prefixed mm_alignr_epi8.
// NOTE(review): mm_alignr_epi8 is defined outside this excerpt -- presumably
// equivalent to the intrinsic; confirm.
- case -3: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 3 * sizeof(EntryType))), _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 1 * sizeof(EntryType))));
- case -2: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 2 * sizeof(EntryType))), _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 2 * sizeof(EntryType))));
- case -1: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 1 * sizeof(EntryType))), _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 3 * sizeof(EntryType))));
+ case -3: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof)), _mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof)));
+ case -2: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof)), _mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof)));
+ case -1: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof)), _mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof)));
case 0: return *this;
// amount in [1, 3]: first part is an alignr across (d.v()[1], d.v()[0]) with
// byte offset `amount` entries; second part is d.v()[1] byte-shifted right.
- case 1: return M256::create(_mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 1 * sizeof(EntryType))), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 1 * sizeof(EntryType))));
- case 2: return M256::create(_mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 2 * sizeof(EntryType))), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 2 * sizeof(EntryType))));
- case 3: return M256::create(_mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 3 * sizeof(EntryType))), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 3 * sizeof(EntryType))));
+ case 1: return M256::create(_mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 1 * EntryTypeSizeof)), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 1 * EntryTypeSizeof)));
+ case 2: return M256::create(_mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 2 * EntryTypeSizeof)), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 2 * EntryTypeSizeof)));
+ case 3: return M256::create(_mm_castsi128_ps(mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 3 * EntryTypeSizeof)), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 3 * EntryTypeSizeof)));
// amount == 4: d.v()[1] becomes the first part as-is.
case 4: return M256::create(d.v()[1], _mm_setzero_ps());
// amount in [5, 7]: only d.v()[1] contributes; the first part is d.v()[1]
// byte-shifted right by (amount - 4) entries and the second part is zero.
- case 5: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 1 * sizeof(EntryType))), _mm_setzero_ps());
- case 6: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 2 * sizeof(EntryType))), _mm_setzero_ps());
- case 7: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 3 * sizeof(EntryType))), _mm_setzero_ps());
+ case 5: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 1 * EntryTypeSizeof)), _mm_setzero_ps());
+ case 6: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 2 * EntryTypeSizeof)), _mm_setzero_ps());
+ case 7: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 3 * EntryTypeSizeof)), _mm_setzero_ps());
}
// No case matched: the requested shift is outside [-7, 7].
return Zero();
}
// Returns a copy of this vector with its entries rotated by `amount`.
// amount is converted to unsigned int and reduced modulo Size; the rotation is
// then an alignr of the register with itself, with a byte offset of
// (reduced amount) * sizeof(EntryType). A reduced amount of 0 returns *this.
// The trailing Zero() is only reached when the reduced amount is not in 0..7.
template<typename T> Vc_INTRINSIC Vc_PURE Vector<T> Vector<T>::rotated(int amount) const
{
// Patch addition: sizeof(EntryType) exposed as an enumerator, which the new
// case lines use in place of the sizeof expression.
+ enum {
+ EntryTypeSizeof = sizeof(EntryType)
+ };
const __m128i v = mm128_reinterpret_cast<__m128i>(d.v());
switch (static_cast<unsigned int>(amount) % Size) {
case 0: return *this;
// The new lines also switch _mm_alignr_epi8 to the un-prefixed mm_alignr_epi8.
// NOTE(review): mm_alignr_epi8 is defined outside this excerpt -- presumably
// equivalent to the intrinsic; confirm.
- case 1: return mm128_reinterpret_cast<VectorType>(_mm_alignr_epi8(v, v, 1 * sizeof(EntryType)));
- case 2: return mm128_reinterpret_cast<VectorType>(_mm_alignr_epi8(v, v, 2 * sizeof(EntryType)));
- case 3: return mm128_reinterpret_cast<VectorType>(_mm_alignr_epi8(v, v, 3 * sizeof(EntryType)));
+ case 1: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 1 * EntryTypeSizeof));
+ case 2: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 2 * EntryTypeSizeof));
+ case 3: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 3 * EntryTypeSizeof));
// warning "Immediate parameter to intrinsic call too large" disabled in VcMacros.cmake.
// ICC fails to see that the modulo operation (Size == sizeof(VectorType) / sizeof(EntryType))
// disables the following four calls unless sizeof(EntryType) == 2.
- case 4: return mm128_reinterpret_cast<VectorType>(_mm_alignr_epi8(v, v, 4 * sizeof(EntryType)));
- case 5: return mm128_reinterpret_cast<VectorType>(_mm_alignr_epi8(v, v, 5 * sizeof(EntryType)));
- case 6: return mm128_reinterpret_cast<VectorType>(_mm_alignr_epi8(v, v, 6 * sizeof(EntryType)));
- case 7: return mm128_reinterpret_cast<VectorType>(_mm_alignr_epi8(v, v, 7 * sizeof(EntryType)));
+ case 4: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 4 * EntryTypeSizeof));
+ case 5: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 5 * EntryTypeSizeof));
+ case 6: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 6 * EntryTypeSizeof));
+ case 7: return mm128_reinterpret_cast<VectorType>(mm_alignr_epi8(v, v, 7 * EntryTypeSizeof));
}
// Fallback when no case above matched.
return Zero();
}
// Specialization of rotated() for sfloat_v, whose data is accessed as two
// parts (d.v()[0] and d.v()[1]) and reassembled with M256::create.
// amount is converted to unsigned int and reduced modulo Size; each result
// part is an alignr across the two parts, taken in one order or the other.
// The trailing Zero() is only reached when the reduced amount is not in 0..7.
template<> Vc_INTRINSIC Vc_PURE sfloat_v sfloat_v::rotated(int amount) const
{
// Patch addition: sizeof(EntryType) exposed as an enumerator, which the new
// case lines use in place of the sizeof expression.
+ enum {
+ EntryTypeSizeof = sizeof(EntryType)
+ };
// Integer views of the two parts, as required by the alignr calls below.
const __m128i v0 = sse_cast<__m128i>(d.v()[0]);
const __m128i v1 = sse_cast<__m128i>(d.v()[1]);
switch (static_cast<unsigned int>(amount) % Size) {
case 0: return *this;
// Reduced amount 1..3: first part = alignr(v1, v0, k), second = alignr(v0, v1, k).
// The new lines also switch _mm_alignr_epi8 to the un-prefixed mm_alignr_epi8.
// NOTE(review): mm_alignr_epi8 is defined outside this excerpt -- presumably
// equivalent to the intrinsic; confirm.
- case 1: return M256::create(sse_cast<__m128>(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType))), sse_cast<__m128>(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType))));
- case 2: return M256::create(sse_cast<__m128>(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType))), sse_cast<__m128>(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType))));
- case 3: return M256::create(sse_cast<__m128>(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType))), sse_cast<__m128>(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType))));
+ case 1: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v1, v0, 1 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v0, v1, 1 * EntryTypeSizeof)));
+ case 2: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v1, v0, 2 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v0, v1, 2 * EntryTypeSizeof)));
+ case 3: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v1, v0, 3 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v0, v1, 3 * EntryTypeSizeof)));
// Reduced amount 4: the two parts simply swap places.
case 4: return M256::create(d.v()[1], d.v()[0]);
// Reduced amount 5..7: same as 1..3 with the roles of v0 and v1 exchanged and
// a byte offset of (amount - 4) entries.
- case 5: return M256::create(sse_cast<__m128>(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType))), sse_cast<__m128>(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType))));
- case 6: return M256::create(sse_cast<__m128>(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType))), sse_cast<__m128>(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType))));
- case 7: return M256::create(sse_cast<__m128>(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType))), sse_cast<__m128>(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType))));
+ case 5: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v0, v1, 1 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v1, v0, 1 * EntryTypeSizeof)));
+ case 6: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v0, v1, 2 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v1, v0, 2 * EntryTypeSizeof)));
+ case 7: return M256::create(sse_cast<__m128>(mm_alignr_epi8(v0, v1, 3 * EntryTypeSizeof)), sse_cast<__m128>(mm_alignr_epi8(v1, v0, 3 * EntryTypeSizeof)));
}
// Fallback when no case above matched.
return Zero();
}
{
__m128i x = data();
__m128i y = _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1));
- __m128i l = _mm_min_epu32(x, y);
- __m128i h = _mm_max_epu32(x, y);
+ __m128i l = mm_min_epu32(x, y);
+ __m128i h = mm_max_epu32(x, y);
x = _mm_unpacklo_epi32(l, h);
y = _mm_unpackhi_epi32(h, l);
// sort quads
- l = _mm_min_epu32(x, y);
- h = _mm_max_epu32(x, y);
+ l = mm_min_epu32(x, y);
+ h = mm_max_epu32(x, y);
x = _mm_unpacklo_epi32(l, h);
y = _mm_unpackhi_epi64(x, x);
- l = _mm_min_epu32(x, y);
- h = _mm_max_epu32(x, y);
+ l = mm_min_epu32(x, y);
+ h = mm_max_epu32(x, y);
return _mm_unpacklo_epi32(l, h);
}
// Specialization of sorted() for ushort_v: returns the entries of data()
// reordered by a fixed sequence of element-wise unsigned 16-bit min/max steps
// combined with permutes, blends, byte shifts and unpacks.
// NOTE(review): presumably the result is in ascending order (min values are
// always placed before max values when recombining) -- confirm.
// NOTE(review): the patch replaces _mm_min_epu16 / _mm_max_epu16 /
// _mm_blend_epi16 with the un-prefixed mm_* names, which are defined outside
// this excerpt -- presumably equivalent to the intrinsics; confirm.
// NOTE(review): no opening brace appears between the signature and the first
// statement in this excerpt; it is presumably in elided context.
template<> inline Vc_PURE ushort_v ushort_v::sorted() const
__m128i lo, hi, y, x = data();
// sort pairs
// y is x with each pair of neighbouring entries swapped.
y = Mem::permute<X1, X0, X3, X2, X5, X4, X7, X6>(x);
- lo = _mm_min_epu16(x, y);
- hi = _mm_max_epu16(x, y);
- x = _mm_blend_epi16(lo, hi, 0xaa);
+ lo = mm_min_epu16(x, y);
+ hi = mm_max_epu16(x, y);
+ x = mm_blend_epi16(lo, hi, 0xaa);
// merge left and right quads
// y is x with the entries of each group of four reversed.
y = Mem::permute<X3, X2, X1, X0, X7, X6, X5, X4>(x);
- lo = _mm_min_epu16(x, y);
- hi = _mm_max_epu16(x, y);
- x = _mm_blend_epi16(lo, hi, 0xcc);
+ lo = mm_min_epu16(x, y);
+ hi = mm_max_epu16(x, y);
+ x = mm_blend_epi16(lo, hi, 0xcc);
// Compare each entry with its neighbour (register shifted by one 16-bit entry).
y = _mm_srli_si128(x, 2);
- lo = _mm_min_epu16(x, y);
- hi = _mm_max_epu16(x, y);
- x = _mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa);
+ lo = mm_min_epu16(x, y);
+ hi = mm_max_epu16(x, y);
+ x = mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa);
// merge quads into octs
// y: 64-bit halves of x swapped, then the low four 16-bit entries reversed.
y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2));
y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3));
- lo = _mm_min_epu16(x, y);
- hi = _mm_max_epu16(x, y);
+ lo = mm_min_epu16(x, y);
+ hi = mm_max_epu16(x, y);
// Interleave the low halves of lo and hi, then compare against the upper
// 64 bits of the result (register shifted right by 8 bytes); repeated below.
x = _mm_unpacklo_epi16(lo, hi);
y = _mm_srli_si128(x, 8);
- lo = _mm_min_epu16(x, y);
- hi = _mm_max_epu16(x, y);
+ lo = mm_min_epu16(x, y);
+ hi = mm_max_epu16(x, y);
x = _mm_unpacklo_epi16(lo, hi);
y = _mm_srli_si128(x, 8);
- lo = _mm_min_epu16(x, y);
- hi = _mm_max_epu16(x, y);
+ lo = mm_min_epu16(x, y);
+ hi = mm_max_epu16(x, y);
// Final interleave of the low halves of lo and hi is the returned vector.
return _mm_unpacklo_epi16(lo, hi);
}