/*  This file is part of the Vc library.

    Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc.  If not, see <http://www.gnu.org/licenses/>.

*/

#ifndef VC_AVX_MASK_H
#define VC_AVX_MASK_H

#include "intrinsics.h"
#include "../common/bitscanintrinsics.h"
#include "macros.h"

namespace AliRoot {
namespace Vc {
namespace AVX {

template<unsigned int VectorSize> class Mask<VectorSize, 32u>
{
    friend class Mask<4u, 32u>; // double_v
    friend class Mask<8u, 32u>; // float_v, (u)int_v
    friend class Mask<8u, 16u>; // (u)short_v
    friend class Mask<16u, 16u>; // (u)char_v
    public:
        FREE_STORE_OPERATORS_ALIGNED(32)
        // abstracts how Masks are passed to functions; it can easily be changed to a const ref here
#if defined VC_MSVC && defined _WIN32
        typedef const Mask<VectorSize, 32u> &AsArg;
#else
        typedef Mask<VectorSize, 32u> AsArg;
#endif
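
        // Usage sketch (hypothetical caller, not part of this header): taking a
        // mask as Mask<...>::AsArg yields a const reference on MSVC/Windows,
        // where over-aligned vector types cannot be passed by value (error
        // C2719), and a by-value (in-register) parameter everywhere else:
        //
        //   template<unsigned int N>
        //   Vc_ALWAYS_INLINE bool anyActive(typename Mask<N, 32u>::AsArg m) {
        //       return !m.isEmpty();
        //   }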

        Vc_ALWAYS_INLINE Mask() {}
        Vc_ALWAYS_INLINE Mask(param256  x) : k(x) {}
        Vc_ALWAYS_INLINE Mask(param256d x) : k(_mm256_castpd_ps(x)) {}
        Vc_ALWAYS_INLINE Mask(param256i x) : k(_mm256_castsi256_ps(x)) {}
#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS
        Vc_ALWAYS_INLINE Mask(__m256  x) : k(x) {}
        Vc_ALWAYS_INLINE Mask(__m256d x) : k(_mm256_castpd_ps(x)) {}
        Vc_ALWAYS_INLINE Mask(__m256i x) : k(_mm256_castsi256_ps(x)) {}
#endif
        Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerZero::ZEnum) : k(_mm256_setzero_ps()) {}
        Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerOne::OEnum) : k(_mm256_setallone_ps()) {}
        Vc_ALWAYS_INLINE explicit Mask(bool b) : k(b ? _mm256_setallone_ps() : m256(_mm256_setzero_ps())) {}
        Vc_ALWAYS_INLINE Mask(const Mask &rhs) : k(rhs.k) {}
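        // The next constructor widens a 16-byte mask to 32 bytes: unpacking each
        // 16-bit mask entry against itself duplicates it into both 16-bit halves
        // of the corresponding 32-bit entry, so 0x0000/0xFFFF entries become
        // 0x00000000/0xFFFFFFFF.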
        Vc_ALWAYS_INLINE Mask(const Mask<VectorSize, 16u> &rhs) : k(avx_cast<m256>(concat(
                _mm_unpacklo_epi16(rhs.dataI(), rhs.dataI()),
                _mm_unpackhi_epi16(rhs.dataI(), rhs.dataI())))) {}
        Vc_ALWAYS_INLINE_L Mask(const Mask<VectorSize * 2, 32u> &m) Vc_ALWAYS_INLINE_R;
        Vc_ALWAYS_INLINE_L Mask(const Mask<VectorSize / 2, 32u> &m) Vc_ALWAYS_INLINE_R;
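
        // Note on the comparisons below: vtestps sets CF iff (~a & b) has no
        // sign bit set. With per-element all-ones/all-zeros masks, testc(k,
        // rhs.k) therefore reports whether every element set in rhs is also set
        // in *this.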
        Vc_ALWAYS_INLINE bool operator==(const Mask &rhs) const { return 0 != _mm256_testc_ps(k, rhs.k); }
        Vc_ALWAYS_INLINE bool operator!=(const Mask &rhs) const { return 0 == _mm256_testc_ps(k, rhs.k); }

        Vc_ALWAYS_INLINE Mask operator!() const { return _mm256_andnot_ps(data(), _mm256_setallone_ps()); }

        Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { k = _mm256_and_ps(k, rhs.k); return *this; }
        Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { k = _mm256_or_ps (k, rhs.k); return *this; }
        Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { k = _mm256_xor_ps(k, rhs.k); return *this; }

        // no need for expression template optimizations because cmp(n)eq for floats are not bitwise
        // compares
        Vc_ALWAYS_INLINE bool isFull () const { return 0 != _mm256_testc_ps(k, _mm256_setallone_ps()); }
        Vc_ALWAYS_INLINE bool isEmpty() const { return 0 != _mm256_testz_ps(k, k); }
        Vc_ALWAYS_INLINE bool isMix () const { return 0 != _mm256_testnzc_ps(k, _mm256_setallone_ps()); }
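
        // Example (hypothetical, assuming float_v from this library): the three
        // predicates distinguish the all/none/some-lanes cases of a comparison.
        //
        //   const float_m m = a < b;
        //   if (m.isFull())       { /* every lane satisfied a < b */ }
        //   else if (m.isEmpty()) { /* no lane satisfied a < b    */ }
        //   else                  { /* m.isMix(): some lanes did  */ }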

#ifndef VC_NO_AUTOMATIC_BOOL_FROM_MASK
        Vc_ALWAYS_INLINE operator bool() const { return isFull(); }
#endif

        Vc_ALWAYS_INLINE_L Vc_PURE_L int shiftMask() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
        Vc_ALWAYS_INLINE_L Vc_PURE_L int toInt() const Vc_ALWAYS_INLINE_R Vc_PURE_R;

        Vc_ALWAYS_INLINE m256  data () const { return k; }
        Vc_ALWAYS_INLINE m256i dataI() const { return _mm256_castps_si256(k); }
        Vc_ALWAYS_INLINE m256d dataD() const { return _mm256_castps_pd(k); }

        Vc_ALWAYS_INLINE_L Vc_PURE_L bool operator[](int index) const Vc_ALWAYS_INLINE_R Vc_PURE_R;

        Vc_ALWAYS_INLINE_L Vc_PURE_L int count() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
        Vc_ALWAYS_INLINE_L Vc_PURE_L int firstOne() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
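
        // Relationship sketch (the definitions live in the accompanying .tcc
        // file; stated here as an assumption for readers): toInt() packs one
        // bit per element, count() is the popcount of that integer, and
        // firstOne() is the index of its lowest set bit.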

    private:
#ifdef VC_COMPILE_BENCHMARKS
    public:
#endif
        m256 k;
};

template<unsigned int VectorSize> class Mask<VectorSize, 16u>
{
    friend class Mask<4u, 32u>; // double_v
    friend class Mask<8u, 32u>; // float_v, (u)int_v
    friend class Mask<8u, 16u>; // (u)short_v
    friend class Mask<16u, 16u>; // (u)char_v
    public:
        FREE_STORE_OPERATORS_ALIGNED(16)
        // abstracts how Masks are passed to functions; it can easily be changed to a const ref here
#if defined VC_MSVC && defined _WIN32
        typedef const Mask<VectorSize, 16u> &AsArg;
#else
        typedef Mask<VectorSize, 16u> AsArg;
#endif

        Vc_ALWAYS_INLINE Mask() {}
        Vc_ALWAYS_INLINE Mask(param128  x) : k(x) {}
        Vc_ALWAYS_INLINE Mask(param128d x) : k(_mm_castpd_ps(x)) {}
        Vc_ALWAYS_INLINE Mask(param128i x) : k(_mm_castsi128_ps(x)) {}
#ifdef VC_UNCONDITIONAL_AVX2_INTRINSICS
        Vc_ALWAYS_INLINE Mask(__m128  x) : k(x) {}
        Vc_ALWAYS_INLINE Mask(__m128d x) : k(_mm_castpd_ps(x)) {}
        Vc_ALWAYS_INLINE Mask(__m128i x) : k(_mm_castsi128_ps(x)) {}
#endif
        Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerZero::ZEnum) : k(_mm_setzero_ps()) {}
        Vc_ALWAYS_INLINE explicit Mask(VectorSpecialInitializerOne::OEnum) : k(_mm_setallone_ps()) {}
        Vc_ALWAYS_INLINE explicit Mask(bool b) : k(b ? _mm_setallone_ps() : m128(_mm_setzero_ps())) {}
        Vc_ALWAYS_INLINE Mask(const Mask &rhs) : k(rhs.k) {}
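        // The next two constructors narrow: packing the two 128-bit halves of a
        // 32-byte mask (or two half-size 16-byte masks) with signed saturation
        // halves each entry while preserving the all-ones/all-zeros pattern.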
        Vc_ALWAYS_INLINE Mask(const Mask<VectorSize, 32u> &rhs) : k(avx_cast<m128>(
                _mm_packs_epi32(avx_cast<m128i>(rhs.data()), _mm256_extractf128_si256(rhs.dataI(), 1)))) {}
        Vc_ALWAYS_INLINE Mask(const Mask<VectorSize / 2, 16u> *a) : k(avx_cast<m128>(
                _mm_packs_epi16(a[0].dataI(), a[1].dataI()))) {}

        Vc_ALWAYS_INLINE bool operator==(const Mask &rhs) const { return 0 != _mm_testc_si128(dataI(), rhs.dataI()); }
        Vc_ALWAYS_INLINE bool operator!=(const Mask &rhs) const { return 0 == _mm_testc_si128(dataI(), rhs.dataI()); }

        Vc_ALWAYS_INLINE Mask operator!() const { return _mm_andnot_ps(data(), _mm_setallone_ps()); }

        Vc_ALWAYS_INLINE Mask &operator&=(const Mask &rhs) { k = _mm_and_ps(k, rhs.k); return *this; }
        Vc_ALWAYS_INLINE Mask &operator|=(const Mask &rhs) { k = _mm_or_ps (k, rhs.k); return *this; }
        Vc_ALWAYS_INLINE Mask &operator^=(const Mask &rhs) { k = _mm_xor_ps(k, rhs.k); return *this; }

        // TODO: use expression templates to optimize (v1 == v2).isFull() and friends
        Vc_ALWAYS_INLINE bool isFull () const { return 0 != _mm_testc_si128(dataI(), _mm_setallone_si128()); }
        Vc_ALWAYS_INLINE bool isEmpty() const { return 0 != _mm_testz_si128(dataI(), dataI()); }
        Vc_ALWAYS_INLINE bool isMix () const { return 0 != _mm_testnzc_si128(dataI(), _mm_setallone_si128()); }

#ifndef VC_NO_AUTOMATIC_BOOL_FROM_MASK
        Vc_ALWAYS_INLINE operator bool() const { return isFull(); }
#endif

        Vc_ALWAYS_INLINE_L Vc_PURE_L int shiftMask() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
        Vc_ALWAYS_INLINE_L Vc_PURE_L int toInt() const Vc_ALWAYS_INLINE_R Vc_PURE_R;

        Vc_ALWAYS_INLINE m128  data () const { return k; }
        Vc_ALWAYS_INLINE m128i dataI() const { return avx_cast<m128i>(k); }
        Vc_ALWAYS_INLINE m128d dataD() const { return avx_cast<m128d>(k); }

        Vc_ALWAYS_INLINE_L Vc_PURE_L bool operator[](int index) const Vc_ALWAYS_INLINE_R Vc_PURE_R;

        Vc_ALWAYS_INLINE_L Vc_PURE_L int count() const Vc_ALWAYS_INLINE_R Vc_PURE_R;
        Vc_ALWAYS_INLINE_L Vc_PURE_L int firstOne() const Vc_ALWAYS_INLINE_R Vc_PURE_R;

    private:
#ifdef VC_COMPILE_BENCHMARKS
    public:
#endif
        m128 k;
};

struct ForeachHelper
{
    size_t mask;
    bool brk;
    bool outerBreak;
    Vc_ALWAYS_INLINE ForeachHelper(size_t _mask) : mask(_mask), brk(false), outerBreak(false) {}
    Vc_ALWAYS_INLINE bool outer() const { return mask != 0 && !outerBreak; }
    Vc_ALWAYS_INLINE bool inner() { return (brk = !brk); }
    Vc_ALWAYS_INLINE void noBreak() { outerBreak = false; }
    Vc_ALWAYS_INLINE size_t next() {
        outerBreak = true;
#ifdef VC_GNU_ASM
        const size_t bit = __builtin_ctzl(mask);
        __asm__("btr %1,%0" : "+r"(mask) : "r"(bit));
#else
#ifdef VC_MSVC
#pragma warning(suppress : 4267) // conversion from 'size_t' to 'unsigned long', possible loss of data
#endif
        const size_t bit = _bit_scan_forward(mask);
        mask &= ~(size_t(1) << bit); // clear the bit, matching the btr path above
#endif
        return bit;
    }
};

#define Vc_foreach_bit(_it_, _mask_) \
    for (Vc::AVX::ForeachHelper Vc__make_unique(foreach_bit_obj)((_mask_).toInt()); Vc__make_unique(foreach_bit_obj).outer(); ) \
        for (_it_ = Vc__make_unique(foreach_bit_obj).next(); Vc__make_unique(foreach_bit_obj).inner(); Vc__make_unique(foreach_bit_obj).noBreak())
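
// How the two nested loops cooperate: outer() keeps iterating while bits
// remain, next() extracts (and clears) the lowest set bit, and inner()
// toggles brk so the user's loop body runs exactly once per bit. A `break`
// in the body skips the noBreak() increment, leaving outerBreak set from
// next() and so terminating the outer loop as well.
//
// Example (hypothetical usage):
//
//   int i;
//   Vc_foreach_bit (i, a < b) {
//       // runs once for every lane where a < b, lowest index first
//   }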

namespace Intrinsics
{
    static Vc_ALWAYS_INLINE Vc_PURE m256 and_(param256 a, param256 b) { return _mm256_and_ps(a, b); }
    static Vc_ALWAYS_INLINE Vc_PURE m256 or_ (param256 a, param256 b) { return _mm256_or_ps (a, b); }
    static Vc_ALWAYS_INLINE Vc_PURE m256 xor_(param256 a, param256 b) { return _mm256_xor_ps(a, b); }

    static Vc_ALWAYS_INLINE Vc_PURE m128 and_(param128 a, param128 b) { return _mm_and_ps(a, b); }
    static Vc_ALWAYS_INLINE Vc_PURE m128 or_ (param128 a, param128 b) { return _mm_or_ps (a, b); }
    static Vc_ALWAYS_INLINE Vc_PURE m128 xor_(param128 a, param128 b) { return _mm_xor_ps(a, b); }
} // namespace Intrinsics

// binary and/or/xor cannot work with one operand larger than the other
template<unsigned int LSize, unsigned int RSize, size_t LWidth, size_t RWidth> void operator&(const Mask<LSize, LWidth> &l, const Mask<RSize, RWidth> &r);
template<unsigned int LSize, unsigned int RSize, size_t LWidth, size_t RWidth> void operator|(const Mask<LSize, LWidth> &l, const Mask<RSize, RWidth> &r);
template<unsigned int LSize, unsigned int RSize, size_t LWidth, size_t RWidth> void operator^(const Mask<LSize, LWidth> &l, const Mask<RSize, RWidth> &r);

// let binary and/or/xor work for any combination of masks (as long as they have the same sizeof)
template<unsigned int LSize, unsigned int RSize, size_t Width> Vc_ALWAYS_INLINE Vc_PURE Mask<LSize, Width> operator&(const Mask<LSize, Width> &l, const Mask<RSize, Width> &r) { return Intrinsics::and_(l.data(), r.data()); }
template<unsigned int LSize, unsigned int RSize, size_t Width> Vc_ALWAYS_INLINE Vc_PURE Mask<LSize, Width> operator|(const Mask<LSize, Width> &l, const Mask<RSize, Width> &r) { return Intrinsics:: or_(l.data(), r.data()); }
template<unsigned int LSize, unsigned int RSize, size_t Width> Vc_ALWAYS_INLINE Vc_PURE Mask<LSize, Width> operator^(const Mask<LSize, Width> &l, const Mask<RSize, Width> &r) { return Intrinsics::xor_(l.data(), r.data()); }
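
// Example (hypothetical): float_m and int_m are both 8-element, 32-byte masks,
// so the equal-sizeof overloads above let them combine directly:
//
//   float_m f; int_m i;
//   float_m both = f & i; // bitwise AND of the underlying 256-bit registers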

// disable logical and/or for incompatible masks
template<unsigned int LSize, unsigned int RSize, size_t LWidth, size_t RWidth> void operator&&(const Mask<LSize, LWidth> &lhs, const Mask<RSize, RWidth> &rhs);
template<unsigned int LSize, unsigned int RSize, size_t LWidth, size_t RWidth> void operator||(const Mask<LSize, LWidth> &lhs, const Mask<RSize, RWidth> &rhs);

// logical and/or for compatible masks
template<unsigned int Size, size_t LWidth, size_t RWidth> Vc_ALWAYS_INLINE Vc_PURE Mask<Size, LWidth> operator&&(const Mask<Size, LWidth> &lhs, const Mask<Size, RWidth> &rhs) { return lhs && static_cast<Mask<Size, LWidth> >(rhs); }
template<unsigned int Size, size_t LWidth, size_t RWidth> Vc_ALWAYS_INLINE Vc_PURE Mask<Size, LWidth> operator||(const Mask<Size, LWidth> &lhs, const Mask<Size, RWidth> &rhs) { return lhs || static_cast<Mask<Size, LWidth> >(rhs); }

template<unsigned int Size, size_t Width> Vc_ALWAYS_INLINE Vc_PURE Mask<Size, Width> operator&&(const Mask<Size, Width> &lhs, const Mask<Size, Width> &rhs) { return Intrinsics::and_(lhs.data(), rhs.data()); }
template<unsigned int Size, size_t Width> Vc_ALWAYS_INLINE Vc_PURE Mask<Size, Width> operator||(const Mask<Size, Width> &lhs, const Mask<Size, Width> &rhs) { return Intrinsics::or_ (lhs.data(), rhs.data()); }
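
// Example (hypothetical): float_v and (u)short_v masks both have 8 elements
// but different register widths (32 vs. 16 bytes), so
//
//   float_m f; ushort_m s;
//   float_m r = f && s; // rhs is converted to Mask<8u, 32u> first
//
// dispatches through the converting overload, then the same-Width one above.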

} // namespace AVX
} // namespace Vc
} // namespace AliRoot

#include "mask.tcc"
#include "undomacros.h"

#endif // VC_AVX_MASK_H