/*  This file is part of the Vc library.

    Copyright (C) 2009-2011 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc.  If not, see <http://www.gnu.org/licenses/>.

*/
19 | ||
20 | #include "casts.h" | |
21 | #include <cstdlib> | |
22 | ||
c017a39f | 23 | namespace AliRoot { |
f22341db | 24 | namespace Vc |
25 | { | |
26 | namespace AVX | |
27 | { | |
28 | ||
29 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
30 | // float_v | |
31 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
32 | //// loads | |
// float_v loads: one specialization per alignment/streaming policy tag.
// The tag type (AlignedFlag, UnalignedFlag, ...) selects the intrinsic at
// compile time; the tag argument itself carries no runtime data.

// Aligned load — m must be 32-byte aligned.
template<> Vc_ALWAYS_INLINE Vc_PURE m256 VectorHelper<m256>::load(const float *m, AlignedFlag)
{
    return _mm256_load_ps(m);
}
// Unaligned load — no alignment requirement on m.
template<> Vc_ALWAYS_INLINE Vc_PURE m256 VectorHelper<m256>::load(const float *m, UnalignedFlag)
{
    return _mm256_loadu_ps(m);
}
// Streaming (non-temporal) aligned load. AVX has no 256-bit streaming load,
// so the two 16-byte halves are fetched with the SSE4.1 integer streaming
// load and concatenated. The const_cast is needed because
// _mm_stream_load_si128 takes a non-const __m128i*; the memory is not
// written. m must still be 16-byte aligned for each half.
template<> Vc_ALWAYS_INLINE Vc_PURE m256 VectorHelper<m256>::load(const float *m, StreamingAndAlignedFlag)
{
    return avx_cast<m256>(concat(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(m))),
            _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(&m[4])))));
}
// Streaming unaligned loads do not exist in hardware: fall back to a plain
// unaligned load and emit a compile-time warning at the call site via VC_WARN.
template<> Vc_ALWAYS_INLINE Vc_PURE m256
VC_WARN("AVX does not support streaming unaligned loads. Will use non-streaming unaligned load instead.")
VectorHelper<m256>::load(const float *m, StreamingAndUnalignedFlag)
{
    return _mm256_loadu_ps(m);
}
52 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
53 | //// stores | |
// float_v stores: unmasked and masked variants for each policy tag.

// Aligned store — mem must be 32-byte aligned.
Vc_ALWAYS_INLINE void VectorHelper<m256>::store(float *mem, VTArg x, AlignedFlag)
{
    _mm256_store_ps(mem, x);
}
// Unaligned store.
Vc_ALWAYS_INLINE void VectorHelper<m256>::store(float *mem, VTArg x, UnalignedFlag)
{
    _mm256_storeu_ps(mem, x);
}
// Streaming aligned store — non-temporal hint, bypasses the cache.
Vc_ALWAYS_INLINE void VectorHelper<m256>::store(float *mem, VTArg x, StreamingAndAlignedFlag)
{
    _mm256_stream_ps(mem, x);
}
// Streaming unaligned store: emulated with two byte-masked non-temporal
// 128-bit stores (_mm_maskmoveu_si128 has no alignment requirement). The
// all-ones mask writes every byte of each half.
Vc_ALWAYS_INLINE void VectorHelper<m256>::store(float *mem, VTArg x, StreamingAndUnalignedFlag)
{
    _mm_maskmoveu_si128(avx_cast<m128i>(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem));
    _mm_maskmoveu_si128(_mm256_extractf128_si256(avx_cast<m256i>(x), 1), _mm_setallone_si128(), reinterpret_cast<char *>(mem + 4));
}
// Masked stores: only lanes selected by m are written.
// NOTE(review): _mm256_maskstore appears to be a Vc overload wrapping the
// _mm256_maskstore_ps intrinsic (which takes an __m256i mask) — confirm in
// the project headers.
Vc_ALWAYS_INLINE void VectorHelper<m256>::store(float *mem, VTArg x, VTArg m, AlignedFlag)
{
    _mm256_maskstore(mem, m, x);
}
Vc_ALWAYS_INLINE void VectorHelper<m256>::store(float *mem, VTArg x, VTArg m, UnalignedFlag)
{
    _mm256_maskstore(mem, m, x);
}
// Masked streaming stores: per-half byte-masked non-temporal stores; bytes
// whose mask bit is clear are left untouched in memory.
Vc_ALWAYS_INLINE void VectorHelper<m256>::store(float *mem, VTArg x, VTArg m, StreamingAndAlignedFlag)
{
    _mm_maskmoveu_si128(avx_cast<m128i>(x), avx_cast<m128i>(m), reinterpret_cast<char *>(mem));
    _mm_maskmoveu_si128(_mm256_extractf128_si256(avx_cast<m256i>(x), 1), _mm256_extractf128_si256(avx_cast<m256i>(m), 1), reinterpret_cast<char *>(mem + 4));
}
Vc_ALWAYS_INLINE void VectorHelper<m256>::store(float *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag)
{
    _mm_maskmoveu_si128(avx_cast<m128i>(x), avx_cast<m128i>(m), reinterpret_cast<char *>(mem));
    _mm_maskmoveu_si128(_mm256_extractf128_si256(avx_cast<m256i>(x), 1), _mm256_extractf128_si256(avx_cast<m256i>(m), 1), reinterpret_cast<char *>(mem + 4));
}
89 | ||
90 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
91 | // double_v | |
92 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
93 | //// loads | |
// double_v loads: same policy-tag dispatch as the float_v specializations,
// with 4 doubles per 256-bit vector (2 per 128-bit half).

// Aligned load — m must be 32-byte aligned.
template<> Vc_ALWAYS_INLINE Vc_PURE m256d VectorHelper<m256d>::load(const double *m, AlignedFlag)
{
    return _mm256_load_pd(m);
}
// Unaligned load.
template<> Vc_ALWAYS_INLINE Vc_PURE m256d VectorHelper<m256d>::load(const double *m, UnalignedFlag)
{
    return _mm256_loadu_pd(m);
}
// Streaming aligned load, emulated with two SSE4.1 streaming loads of the
// 16-byte halves (&m[2] is the upper half). const_cast only satisfies the
// intrinsic's non-const signature; memory is not modified.
template<> Vc_ALWAYS_INLINE Vc_PURE m256d VectorHelper<m256d>::load(const double *m, StreamingAndAlignedFlag)
{
    return avx_cast<m256d>(concat(
        _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(m))),
        _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(&m[2])))));
}
// No hardware support for streaming unaligned loads: plain unaligned load
// plus a compile-time warning.
template<> Vc_ALWAYS_INLINE Vc_PURE m256d
VC_WARN("AVX does not support streaming unaligned loads. Will use non-streaming unaligned load instead.")
VectorHelper<m256d>::load(const double *m, StreamingAndUnalignedFlag)
{
    return _mm256_loadu_pd(m);
}
114 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
115 | //// stores | |
// double_v stores: unmasked and masked variants, mirroring the float_v set
// (offsets use +2 because each 128-bit half holds two doubles).

// Aligned store — mem must be 32-byte aligned.
Vc_ALWAYS_INLINE void VectorHelper<m256d>::store(double *mem, VTArg x, AlignedFlag)
{
    _mm256_store_pd(mem, x);
}
// Unaligned store.
Vc_ALWAYS_INLINE void VectorHelper<m256d>::store(double *mem, VTArg x, UnalignedFlag)
{
    _mm256_storeu_pd(mem, x);
}
// Streaming aligned store — non-temporal, bypasses the cache.
Vc_ALWAYS_INLINE void VectorHelper<m256d>::store(double *mem, VTArg x, StreamingAndAlignedFlag)
{
    _mm256_stream_pd(mem, x);
}
// Streaming unaligned store emulated via two byte-masked non-temporal
// 128-bit stores with an all-ones mask (writes every byte).
Vc_ALWAYS_INLINE void VectorHelper<m256d>::store(double *mem, VTArg x, StreamingAndUnalignedFlag)
{
    _mm_maskmoveu_si128(avx_cast<m128i>(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem));
    _mm_maskmoveu_si128(avx_cast<m128i>(_mm256_extractf128_pd(x, 1)), _mm_setallone_si128(), reinterpret_cast<char *>(mem + 2));
}
// Masked stores — only lanes selected by m are written.
// NOTE(review): _mm256_maskstore is presumably a Vc overload of
// _mm256_maskstore_pd — confirm in the project headers.
Vc_ALWAYS_INLINE void VectorHelper<m256d>::store(double *mem, VTArg x, VTArg m, AlignedFlag)
{
    _mm256_maskstore(mem, m, x);
}
Vc_ALWAYS_INLINE void VectorHelper<m256d>::store(double *mem, VTArg x, VTArg m, UnalignedFlag)
{
    _mm256_maskstore(mem, m, x);
}
// Masked streaming stores: byte-masked non-temporal stores per half; bytes
// with a clear mask bit are left untouched.
Vc_ALWAYS_INLINE void VectorHelper<m256d>::store(double *mem, VTArg x, VTArg m, StreamingAndAlignedFlag)
{
    _mm_maskmoveu_si128(avx_cast<m128i>(x), avx_cast<m128i>(m), reinterpret_cast<char *>(mem));
    _mm_maskmoveu_si128(avx_cast<m128i>(_mm256_extractf128_pd(x, 1)), avx_cast<m128i>(_mm256_extractf128_pd(m, 1)), reinterpret_cast<char *>(mem + 2));
}
Vc_ALWAYS_INLINE void VectorHelper<m256d>::store(double *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag)
{
    _mm_maskmoveu_si128(avx_cast<m128i>(x), avx_cast<m128i>(m), reinterpret_cast<char *>(mem));
    _mm_maskmoveu_si128(avx_cast<m128i>(_mm256_extractf128_pd(x, 1)), avx_cast<m128i>(_mm256_extractf128_pd(m, 1)), reinterpret_cast<char *>(mem + 2));
}
151 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
152 | // (u)int_v | |
153 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
154 | //// loads | |
// (u)int_v loads: templated on the element type T (the pointee is
// reinterpreted as __m256i either way); policy tag selects the intrinsic.

// Aligned load — m must be 32-byte aligned.
template<typename T> Vc_ALWAYS_INLINE Vc_PURE m256i VectorHelper<m256i>::load(const T *m, AlignedFlag)
{
    return _mm256_load_si256(reinterpret_cast<const __m256i *>(m));
}
// Unaligned load.
template<typename T> Vc_ALWAYS_INLINE Vc_PURE m256i VectorHelper<m256i>::load(const T *m, UnalignedFlag)
{
    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(m));
}
// Streaming aligned load: two SSE4.1 streaming 128-bit loads concatenated.
// NOTE(review): &m[4] as the upper-half address assumes sizeof(T) == 4
// (int/unsigned int) — confirm no other T instantiates this.
template<typename T> Vc_ALWAYS_INLINE Vc_PURE m256i VectorHelper<m256i>::load(const T *m, StreamingAndAlignedFlag)
{
    return concat(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<T *>(m))),
           _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<T *>(&m[4]))));
}
// No hardware streaming unaligned load: plain unaligned load plus a
// compile-time warning.
template<typename T> Vc_ALWAYS_INLINE Vc_PURE m256i
VC_WARN("AVX does not support streaming unaligned loads. Will use non-streaming unaligned load instead.")
VectorHelper<m256i>::load(const T *m, StreamingAndUnalignedFlag)
{
    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(m));
}
174 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
175 | //// stores | |
// (u)int_v stores: unmasked and masked variants ("mem + 4" addresses the
// upper half, i.e. four 32-bit elements per 128-bit half).

// Aligned store — mem must be 32-byte aligned.
template<typename T> Vc_ALWAYS_INLINE void VectorHelper<m256i>::store(T *mem, VTArg x, AlignedFlag)
{
    _mm256_store_si256(reinterpret_cast<__m256i *>(mem), x);
}
// Unaligned store.
template<typename T> Vc_ALWAYS_INLINE void VectorHelper<m256i>::store(T *mem, VTArg x, UnalignedFlag)
{
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), x);
}
// Streaming aligned store — non-temporal.
template<typename T> Vc_ALWAYS_INLINE void VectorHelper<m256i>::store(T *mem, VTArg x, StreamingAndAlignedFlag)
{
    _mm256_stream_si256(reinterpret_cast<__m256i *>(mem), x);
}
// Streaming unaligned store emulated with byte-masked non-temporal 128-bit
// stores, all-ones mask (writes everything).
template<typename T> Vc_ALWAYS_INLINE void VectorHelper<m256i>::store(T *mem, VTArg x, StreamingAndUnalignedFlag)
{
    _mm_maskmoveu_si128(avx_cast<m128i>(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem));
    _mm_maskmoveu_si128(_mm256_extractf128_si256(x, 1), _mm_setallone_si128(), reinterpret_cast<char *>(mem + 4));
}
// Masked stores — only lanes selected by m are written.
// NOTE(review): _mm256_maskstore here takes a T* — presumably a Vc overload;
// confirm in the project headers.
template<typename T> Vc_ALWAYS_INLINE void VectorHelper<m256i>::store(T *mem, VTArg x, VTArg m, AlignedFlag)
{
    _mm256_maskstore(mem, m, x);
}
template<typename T> Vc_ALWAYS_INLINE void VectorHelper<m256i>::store(T *mem, VTArg x, VTArg m, UnalignedFlag)
{
    _mm256_maskstore(mem, m, x);
}
// Masked streaming stores: lo128/hi128 extract the halves (equivalent in
// effect to the avx_cast/_mm256_extractf128_si256 pattern used above).
template<typename T> Vc_ALWAYS_INLINE void VectorHelper<m256i>::store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag)
{
    _mm_maskmoveu_si128(lo128(x), lo128(m), reinterpret_cast<char *>(mem));
    _mm_maskmoveu_si128(hi128(x), hi128(m), reinterpret_cast<char *>(mem + 4));
}
template<typename T> Vc_ALWAYS_INLINE void VectorHelper<m256i>::store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag)
{
    _mm_maskmoveu_si128(lo128(x), lo128(m), reinterpret_cast<char *>(mem));
    _mm_maskmoveu_si128(hi128(x), hi128(m), reinterpret_cast<char *>(mem + 4));
}
211 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
212 | // (u)short_v | |
213 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
214 | //// loads | |
// (u)short_v loads: these vectors are only 128 bits wide, so a single SSE
// intrinsic suffices per policy (no concat of halves needed).

// Aligned load — m must be 16-byte aligned.
template<typename T> Vc_ALWAYS_INLINE Vc_PURE m128i VectorHelper<m128i>::load(const T *m, AlignedFlag)
{
    return _mm_load_si128(reinterpret_cast<const __m128i *>(m));
}
// Unaligned load.
template<typename T> Vc_ALWAYS_INLINE Vc_PURE m128i VectorHelper<m128i>::load(const T *m, UnalignedFlag)
{
    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(m));
}
// Streaming aligned load (SSE4.1); const_cast only satisfies the intrinsic's
// non-const signature — memory is not written.
template<typename T> Vc_ALWAYS_INLINE Vc_PURE m128i VectorHelper<m128i>::load(const T *m, StreamingAndAlignedFlag)
{
    return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<T *>(m)));
}
// No hardware streaming unaligned load: fall back to the plain unaligned
// load and warn at compile time.
template<typename T> Vc_ALWAYS_INLINE Vc_PURE m128i
VC_WARN("AVX does not support streaming unaligned loads. Will use non-streaming unaligned load instead.")
VectorHelper<m128i>::load(const T *m, StreamingAndUnalignedFlag)
{
    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(m));
}
233 | //////////////////////////////////////////////////////////////////////////////////////////////////// | |
234 | //// stores | |
// (u)short_v stores (128-bit vectors).

// Aligned store — mem must be 16-byte aligned.
template<typename T> Vc_ALWAYS_INLINE void VectorHelper<m128i>::store(T *mem, VTArg x, AlignedFlag)
{
    _mm_store_si128(reinterpret_cast<__m128i *>(mem), x);
}
// Unaligned store.
template<typename T> Vc_ALWAYS_INLINE void VectorHelper<m128i>::store(T *mem, VTArg x, UnalignedFlag)
{
    _mm_storeu_si128(reinterpret_cast<__m128i *>(mem), x);
}
// Streaming aligned store — non-temporal.
template<typename T> Vc_ALWAYS_INLINE void VectorHelper<m128i>::store(T *mem, VTArg x, StreamingAndAlignedFlag)
{
    _mm_stream_si128(reinterpret_cast<__m128i *>(mem), x);
}
// Streaming unaligned store: byte-masked non-temporal store with an
// all-ones mask (writes all 16 bytes, no alignment requirement).
template<typename T> Vc_ALWAYS_INLINE void VectorHelper<m128i>::store(T *mem, VTArg x, StreamingAndUnalignedFlag)
{
    _mm_maskmoveu_si128(x, _mm_setallone_si128(), reinterpret_cast<char *>(mem));
}
// Masked non-streaming stores: no 16-bit-granular maskstore exists, so this
// is a read-modify-write — load the current memory contents, blend in the
// selected lanes of x byte-wise, and store the result back. Unlike the
// maskmoveu variants below, this touches (rewrites) unselected elements.
template<typename T> Vc_ALWAYS_INLINE void VectorHelper<m128i>::store(T *mem, VTArg x, VTArg m, AlignedFlag align)
{
    store(mem, _mm_blendv_epi8(load(mem, align), x, m), align);
}
template<typename T> Vc_ALWAYS_INLINE void VectorHelper<m128i>::store(T *mem, VTArg x, VTArg m, UnalignedFlag align)
{
    store(mem, _mm_blendv_epi8(load(mem, align), x, m), align);
}
// Masked streaming stores: byte-masked non-temporal store; bytes with a
// clear mask bit are left untouched in memory.
template<typename T> Vc_ALWAYS_INLINE void VectorHelper<m128i>::store(T *mem, VTArg x, VTArg m, StreamingAndAlignedFlag)
{
    _mm_maskmoveu_si128(x, m, reinterpret_cast<char *>(mem));
}
template<typename T> Vc_ALWAYS_INLINE void VectorHelper<m128i>::store(T *mem, VTArg x, VTArg m, StreamingAndUnalignedFlag)
{
    _mm_maskmoveu_si128(x, m, reinterpret_cast<char *>(mem));
}
267 | ||
268 | } // namespace AVX | |
269 | } // namespace Vc | |
c017a39f | 270 | } // namespace AliRoot |