]>
Commit | Line | Data |
---|---|---|
c017a39f | 1 | /* This file is part of the Vc library. |
2 | ||
3 | Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org> | |
4 | ||
5 | Vc is free software: you can redistribute it and/or modify | |
6 | it under the terms of the GNU Lesser General Public License as | |
7 | published by the Free Software Foundation, either version 3 of | |
8 | the License, or (at your option) any later version. | |
9 | ||
10 | Vc is distributed in the hope that it will be useful, but | |
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | GNU Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with Vc. If not, see <http://www.gnu.org/licenses/>. | |
17 | ||
18 | */ | |
19 | ||
// V_ALIGN(n) requests n-byte alignment for the declaration it prefixes.
// Compilers that define __GNUC__ get the attribute spelling; every other
// compiler is given the __declspec(align(n)) spelling. A V_ALIGN that is
// already defined when this point is reached is left untouched.
#if !defined(V_ALIGN)
#  if defined(__GNUC__)
#    define V_ALIGN(n) __attribute__((aligned(n)))
#  else
#    define V_ALIGN(n) __declspec(align(n))
#  endif
#endif
27 | ||
28 | #include "Vc/avx/const_data.h" | |
29 | #include "Vc/sse/const_data.h" | |
30 | #include <Vc/version.h> | |
31 | ||
32 | #include <cstdio> | |
33 | #include <cstdlib> | |
34 | #include <cstring> | |
35 | ||
36 | #include "Vc/common/macros.h" | |
37 | ||
38 | namespace AliRoot { | |
39 | namespace Vc | |
40 | { | |
41 | namespace AVX | |
42 | { | |
43 | // cacheline 1 | |
44 | V_ALIGN(64) extern const unsigned int _IndexesFromZero32[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; | |
45 | V_ALIGN(16) extern const unsigned short _IndexesFromZero16[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; | |
46 | V_ALIGN(16) extern const unsigned char _IndexesFromZero8 [16]= { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; | |
47 | ||
48 | template<> const double c_trig<double>::data[] = { | |
49 | // cacheline 4 | |
50 | Vc_buildDouble(1, 0x921fb54442d18ull, -1), // π/4 | |
51 | Vc_buildDouble(1, 0x921fb40000000ull, -1), // π/4 - 30bits precision | |
52 | Vc_buildDouble(1, 0x4442d00000000ull, -25), // π/4 remainder1 - 32bits precision | |
53 | Vc_buildDouble(1, 0x8469898cc5170ull, -49), // π/4 remainder2 | |
54 | 0.0625, | |
55 | 16., | |
56 | 0., // padding | |
57 | 0., // padding | |
58 | // cacheline 5 | |
59 | Vc_buildDouble( 1, 0x555555555554bull, -5), // ~ 1/4! | |
60 | Vc_buildDouble(-1, 0x6c16c16c14f91ull, -10), // ~-1/6! | |
61 | Vc_buildDouble( 1, 0xa01a019c844f5ull, -16), // ~ 1/8! | |
62 | Vc_buildDouble(-1, 0x27e4f7eac4bc6ull, -22), // ~-1/10! | |
63 | Vc_buildDouble( 1, 0x1ee9d7b4e3f05ull, -29), // ~ 1/12! | |
64 | Vc_buildDouble(-1, 0x8fa49a0861a9bull, -37), // ~-1/14! | |
65 | Vc_buildDouble(-1, 0x5555555555548ull, -3), // ~-1/3! | |
66 | Vc_buildDouble( 1, 0x111111110f7d0ull, -7), // ~ 1/5! | |
67 | // cacheline 8 | |
68 | Vc_buildDouble(-1, 0xa01a019bfdf03ull, -13), // ~-1/7! | |
69 | Vc_buildDouble( 1, 0x71de3567d48a1ull, -19), // ~ 1/9! | |
70 | Vc_buildDouble(-1, 0xae5e5a9291f5dull, -26), // ~-1/11! | |
71 | Vc_buildDouble( 1, 0x5d8fd1fd19ccdull, -33), // ~ 1/13! | |
72 | 0., // padding (for alignment with float) | |
73 | Vc_buildDouble(1, 0x8BE60DB939105ull, 0), // 4/π | |
74 | Vc_buildDouble(1, 0x921fb54442d18ull, 0), // π/2 | |
75 | Vc_buildDouble(1, 0x921fb54442d18ull, 1), // π | |
76 | // cacheline 10 | |
77 | Vc_buildDouble(-1, 0xc007fa1f72594ull, -1), // atan P coefficients | |
78 | Vc_buildDouble(-1, 0x028545b6b807aull, 4), // atan P coefficients | |
79 | Vc_buildDouble(-1, 0x2c08c36880273ull, 6), // atan P coefficients | |
80 | Vc_buildDouble(-1, 0xeb8bf2d05ba25ull, 6), // atan P coefficients | |
81 | Vc_buildDouble(-1, 0x03669fd28ec8eull, 6), // atan P coefficients | |
82 | Vc_buildDouble( 1, 0x8dbc45b14603cull, 4), // atan Q coefficients | |
83 | Vc_buildDouble( 1, 0x4a0dd43b8fa25ull, 7), // atan Q coefficients | |
84 | Vc_buildDouble( 1, 0xb0e18d2e2be3bull, 8), // atan Q coefficients | |
85 | // cacheline 12 | |
86 | Vc_buildDouble( 1, 0xe563f13b049eaull, 8), // atan Q coefficients | |
87 | Vc_buildDouble( 1, 0x8519efbbd62ecull, 7), // atan Q coefficients | |
88 | Vc_buildDouble( 1, 0x3504f333f9de6ull, 1), // tan( 3/8 π ) | |
89 | 0.66, // lower threshold for special casing in atan | |
90 | Vc_buildDouble(1, 0x1A62633145C07ull, -54), // remainder of pi/2 | |
91 | 1.e-8, // small asin input threshold | |
92 | 0.625, // large asin input threshold | |
93 | 0., // padding | |
94 | // cacheline 14 | |
95 | Vc_buildDouble( 1, 0x84fc3988e9f08ull, -9), // asinCoeff0 | |
96 | Vc_buildDouble(-1, 0x2079259f9290full, -1), // asinCoeff0 | |
97 | Vc_buildDouble( 1, 0xbdff5baf33e6aull, 2), // asinCoeff0 | |
98 | Vc_buildDouble(-1, 0x991aaac01ab68ull, 4), // asinCoeff0 | |
99 | Vc_buildDouble( 1, 0xc896240f3081dull, 4), // asinCoeff0 | |
100 | Vc_buildDouble(-1, 0x5f2a2b6bf5d8cull, 4), // asinCoeff1 | |
101 | Vc_buildDouble( 1, 0x26219af6a7f42ull, 7), // asinCoeff1 | |
102 | Vc_buildDouble(-1, 0x7fe08959063eeull, 8), // asinCoeff1 | |
103 | // cacheline 16 | |
104 | Vc_buildDouble( 1, 0x56709b0b644beull, 8), // asinCoeff1 | |
105 | Vc_buildDouble( 1, 0x16b9b0bd48ad3ull, -8), // asinCoeff2 | |
106 | Vc_buildDouble(-1, 0x34341333e5c16ull, -1), // asinCoeff2 | |
107 | Vc_buildDouble( 1, 0x5c74b178a2dd9ull, 2), // asinCoeff2 | |
108 | Vc_buildDouble(-1, 0x04331de27907bull, 4), // asinCoeff2 | |
109 | Vc_buildDouble( 1, 0x39007da779259ull, 4), // asinCoeff2 | |
110 | Vc_buildDouble(-1, 0x0656c06ceafd5ull, 3), // asinCoeff2 | |
111 | Vc_buildDouble(-1, 0xd7b590b5e0eabull, 3), // asinCoeff3 | |
112 | // cacheline 18 | |
113 | Vc_buildDouble( 1, 0x19fc025fe9054ull, 6), // asinCoeff3 | |
114 | Vc_buildDouble(-1, 0x265bb6d3576d7ull, 7), // asinCoeff3 | |
115 | Vc_buildDouble( 1, 0x1705684ffbf9dull, 7), // asinCoeff3 | |
116 | Vc_buildDouble(-1, 0x898220a3607acull, 5), // asinCoeff3 | |
117 | }; | |
118 | #define _4(x) x | |
119 | template<> const float c_trig<float>::data[] = { | |
120 | // cacheline | |
121 | _4(Vc_buildFloat( 1, 0x490FDB, -1)), // π/4 | |
122 | _4(Vc_buildFloat( 1, 0x491000, -1)), // π/4 - 12 bits precision | |
123 | _4(Vc_buildFloat(-1, 0x157000, -19)), // π/4 remainder1 - 12 bits precision | |
124 | _4(Vc_buildFloat(-1, 0x6F4B9F, -32)), // π/4 remainder2 | |
125 | _4(0.0625f), | |
126 | _4(16.f), | |
127 | _4(0.f), // padding | |
128 | _4(0.f), // padding | |
129 | _4(4.166664568298827e-2f), // ~ 1/4! | |
130 | _4(-1.388731625493765e-3f), // ~-1/6! | |
131 | _4(2.443315711809948e-5f), // ~ 1/8! | |
132 | _4(0.f), // padding (for alignment with double) | |
133 | _4(0.f), // padding (for alignment with double) | |
134 | _4(0.f), // padding (for alignment with double) | |
135 | _4(-1.6666654611e-1f), // ~-1/3! | |
136 | _4(8.3321608736e-3f), // ~ 1/5! | |
137 | // cacheline | |
138 | _4(-1.9515295891e-4f), // ~-1/7! | |
139 | _4(0.f), // padding (for alignment with double) | |
140 | _4(0.f), // padding (for alignment with double) | |
141 | _4(0.f), // padding (for alignment with double) | |
142 | _4(8192.f), // loss threshold | |
143 | _4(Vc_buildFloat(1, 0x22F983, 0)), // 1.27323949337005615234375 = 4/π | |
144 | _4(Vc_buildFloat(1, 0x490FDB, 0)), // π/2 | |
145 | _4(Vc_buildFloat(1, 0x490FDB, 1)), // π | |
146 | _4(8.05374449538e-2f), // atan P coefficients | |
147 | _4(1.38776856032e-1f), // atan P coefficients | |
148 | _4(1.99777106478e-1f), // atan P coefficients | |
149 | _4(3.33329491539e-1f), // atan P coefficients | |
150 | _4(0.f), // padding (for alignment with double) | |
151 | _4(0.f), // padding (for alignment with double) | |
152 | _4(0.f), // padding (for alignment with double) | |
153 | _4(0.f), // padding (for alignment with double) | |
154 | // cacheline | |
155 | _4(0.f), // padding (for alignment with double) | |
156 | _4(0.f), // padding (for alignment with double) | |
157 | _4(2.414213562373095f), // tan( 3/8 π ) | |
158 | _4(0.414213562373095f), // tan( 1/8 π ) lower threshold for special casing in atan | |
159 | _4(Vc_buildFloat(-1, 0x3BBD2E, -25)), // remainder of pi/2 | |
160 | _4(1.e-4f), // small asin input threshold | |
161 | _4(0.f), // padding (for alignment with double) | |
162 | _4(0.f), // padding (for alignment with double) | |
163 | _4(4.2163199048e-2f), // asinCoeff0 | |
164 | _4(2.4181311049e-2f), // asinCoeff0 | |
165 | _4(4.5470025998e-2f), // asinCoeff0 | |
166 | _4(7.4953002686e-2f), // asinCoeff0 | |
167 | _4(1.6666752422e-1f), // asinCoeff0 | |
168 | _4(0.f), // padding (for alignment with double) | |
169 | _4(0.f), // padding (for alignment with double) | |
170 | _4(0.f), // padding (for alignment with double) | |
171 | // cacheline | |
172 | _4(0.f), // padding (for alignment with double) | |
173 | _4(0.f), // padding (for alignment with double) | |
174 | _4(0.f), // padding (for alignment with double) | |
175 | _4(0.f), // padding (for alignment with double) | |
176 | _4(0.f), // padding (for alignment with double) | |
177 | _4(0.f), // padding (for alignment with double) | |
178 | _4(0.f), // padding (for alignment with double) | |
179 | _4(0.f), // padding (for alignment with double) | |
180 | _4(0.f), // padding (for alignment with double) | |
181 | _4(0.f), // padding (for alignment with double) | |
182 | _4(0.f), // padding (for alignment with double) | |
183 | _4(0.f), // padding (for alignment with double) | |
184 | }; | |
185 | #undef _4 | |
186 | ||
187 | const unsigned int c_general::absMaskFloat[2] = { 0xffffffffu, 0x7fffffffu }; | |
188 | const unsigned int c_general::signMaskFloat[2] = { 0x0u, 0x80000000u }; | |
189 | const unsigned int c_general::highMaskFloat = 0xfffff000u; | |
190 | const float c_general::oneFloat = 1.f; | |
191 | const unsigned short c_general::minShort[2] = { 0x8000u, 0x8000u }; | |
192 | const unsigned short c_general::one16[2] = { 1, 1 }; | |
193 | const float c_general::_2power31 = 1u << 31; | |
194 | ||
195 | // cacheline 4 | |
196 | const unsigned long long c_general::highMaskDouble = 0xfffffffff8000000ull; | |
197 | const double c_general::oneDouble = 1.; | |
198 | const unsigned long long c_general::frexpMask = 0xbfefffffffffffffull; | |
199 | ||
200 | const unsigned long long c_log<double>::data[21] = { | |
201 | 0x000003ff000003ffull // bias TODO: remove | |
202 | , 0x7ff0000000000000ull // exponentMask (+inf) | |
203 | ||
204 | , 0x3f1ab4c293c31bb0ull // P[0] | |
205 | , 0x3fdfd6f53f5652f2ull // P[1] | |
206 | , 0x4012d2baed926911ull // P[2] | |
207 | , 0x402cff72c63eeb2eull // P[3] | |
208 | , 0x4031efd6924bc84dull // P[4] | |
209 | , 0x401ed5637d7edcf8ull // P[5] | |
210 | ||
211 | , 0x40269320ae97ef8eull // Q[0] | |
212 | , 0x40469d2c4e19c033ull // Q[1] | |
213 | , 0x4054bf33a326bdbdull // Q[2] | |
214 | , 0x4051c9e2eb5eae21ull // Q[3] | |
215 | , 0x4037200a9e1f25b2ull // Q[4] | |
216 | ||
217 | , 0xfff0000000000000ull // -inf | |
218 | , 0x0010000000000000ull // min() | |
219 | , 0x3fe6a09e667f3bcdull // 1/sqrt(2) | |
220 | , 0x3fe6300000000000ull // round(ln(2) * 512) / 512 | |
221 | , 0xbf2bd0105c610ca8ull // ln(2) - round(ln(2) * 512) / 512 | |
222 | , 0x3fe0000000000000ull // 0.5 | |
223 | , 0x3fdbcb7b1526e50eull // log10(e) | |
224 | , 0x3ff71547652b82feull // log2(e) | |
225 | }; | |
226 | ||
227 | template<> const unsigned int c_log<float>::data[21] = { | |
228 | 0x0000007fu // bias TODO: remove | |
229 | , 0x7f800000u // exponentMask (+inf) | |
230 | ||
231 | , 0x3d9021bbu // 7.0376836292e-2f // P[0] | |
232 | , 0xbdebd1b8u // -1.1514610310e-1f // P[1] | |
233 | , 0x3def251au // 1.1676998740e-1f // P[2] | |
234 | , 0xbdfe5d4fu // -1.2420140846e-1f // P[3] | |
235 | , 0x3e11e9bfu // 1.4249322787e-1f // P[4] | |
236 | , 0xbe2aae50u // -1.6668057665e-1f // P[5] | |
237 | , 0x3e4cceacu // 2.0000714765e-1f // P[6] | |
238 | , 0xbe7ffffcu // -2.4999993993e-1f // P[7] | |
239 | , 0x3eaaaaaau // 3.3333331174e-1f // P[8] | |
240 | , 0 // padding because of c_log<double> | |
241 | , 0 // padding because of c_log<double> | |
242 | ||
243 | , 0xff800000u // -inf | |
244 | , 0x00800000u // min() | |
245 | , 0x3f3504f3u // 1/sqrt(2) | |
246 | , 0x3f318000u // round(ln(2) * 512) / 512 | |
247 | , 0xb95e8083u // ln(2) - round(ln(2) * 512) / 512 | |
248 | , 0x3f000000u // 0.5 | |
249 | , 0x3ede5bd9u // log10(e) | |
250 | , 0x3fb8aa3bu // log2(e) | |
251 | }; | |
252 | } // namespace AVX | |
253 | ||
254 | namespace SSE | |
255 | { | |
256 | // cacheline 1 | |
257 | V_ALIGN(64) const int c_general::absMaskFloat[4] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; | |
258 | V_ALIGN(16) const unsigned int c_general::signMaskFloat[4] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; | |
259 | V_ALIGN(16) const unsigned int c_general::highMaskFloat[4] = { 0xfffff000u, 0xfffff000u, 0xfffff000u, 0xfffff000u }; | |
260 | V_ALIGN(16) const short c_general::minShort[8] = { -0x8000, -0x8000, -0x8000, -0x8000, -0x8000, -0x8000, -0x8000, -0x8000 }; | |
261 | V_ALIGN(16) extern const unsigned short _IndexesFromZero8[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; | |
262 | ||
263 | // cacheline 2 | |
264 | V_ALIGN(16) extern const unsigned int _IndexesFromZero4[4] = { 0, 1, 2, 3 }; | |
265 | V_ALIGN(16) const unsigned short c_general::one16[8] = { 1, 1, 1, 1, 1, 1, 1, 1 }; | |
266 | V_ALIGN(16) const unsigned int c_general::one32[4] = { 1, 1, 1, 1 }; | |
267 | V_ALIGN(16) const float c_general::oneFloat[4] = { 1.f, 1.f, 1.f, 1.f }; | |
268 | ||
269 | // cacheline 3 | |
270 | V_ALIGN(16) const unsigned long long c_general::highMaskDouble[2] = { 0xfffffffff8000000ull, 0xfffffffff8000000ull }; | |
271 | V_ALIGN(16) const double c_general::oneDouble[2] = { 1., 1. }; | |
272 | V_ALIGN(16) const long long c_general::absMaskDouble[2] = { 0x7fffffffffffffffll, 0x7fffffffffffffffll }; | |
273 | V_ALIGN(16) const unsigned long long c_general::signMaskDouble[2] = { 0x8000000000000000ull, 0x8000000000000000ull }; | |
274 | V_ALIGN(16) const unsigned long long c_general::frexpMask[2] = { 0xbfefffffffffffffull, 0xbfefffffffffffffull }; | |
275 | ||
276 | #define _2(x) x, x | |
277 | template<> const double c_trig<double>::data[] = { | |
278 | // cacheline 4 | |
279 | _2(Vc_buildDouble(1, 0x921fb54442d18ull, -1)), // π/4 | |
280 | _2(Vc_buildDouble(1, 0x921fb40000000ull, -1)), // π/4 - 30bits precision | |
281 | _2(Vc_buildDouble(1, 0x4442d00000000ull, -25)), // π/4 remainder1 - 32bits precision | |
282 | _2(Vc_buildDouble(1, 0x8469898cc5170ull, -49)), // π/4 remainder2 | |
283 | // cacheline 5 | |
284 | _2(0.0625), | |
285 | _2(16.), | |
286 | _2(0.), // padding | |
287 | _2(0.), // padding | |
288 | // cacheline 6 | |
289 | _2(Vc_buildDouble( 1, 0x555555555554bull, -5)), // ~ 1/4! | |
290 | _2(Vc_buildDouble(-1, 0x6c16c16c14f91ull, -10)), // ~-1/6! | |
291 | _2(Vc_buildDouble( 1, 0xa01a019c844f5ull, -16)), // ~ 1/8! | |
292 | _2(Vc_buildDouble(-1, 0x27e4f7eac4bc6ull, -22)), // ~-1/10! | |
293 | // cacheline 7 | |
294 | _2(Vc_buildDouble( 1, 0x1ee9d7b4e3f05ull, -29)), // ~ 1/12! | |
295 | _2(Vc_buildDouble(-1, 0x8fa49a0861a9bull, -37)), // ~-1/14! | |
296 | _2(Vc_buildDouble(-1, 0x5555555555548ull, -3)), // ~-1/3! | |
297 | _2(Vc_buildDouble( 1, 0x111111110f7d0ull, -7)), // ~ 1/5! | |
298 | // cacheline 8 | |
299 | _2(Vc_buildDouble(-1, 0xa01a019bfdf03ull, -13)), // ~-1/7! | |
300 | _2(Vc_buildDouble( 1, 0x71de3567d48a1ull, -19)), // ~ 1/9! | |
301 | _2(Vc_buildDouble(-1, 0xae5e5a9291f5dull, -26)), // ~-1/11! | |
302 | _2(Vc_buildDouble( 1, 0x5d8fd1fd19ccdull, -33)), // ~ 1/13! | |
303 | // cacheline 9 | |
304 | _2(0.), // padding (for alignment with float) | |
305 | _2(Vc_buildDouble(1, 0x8BE60DB939105ull, 0)), // 4/π | |
306 | _2(Vc_buildDouble(1, 0x921fb54442d18ull, 0)), // π/2 | |
307 | _2(Vc_buildDouble(1, 0x921fb54442d18ull, 1)), // π | |
308 | // cacheline 10 | |
309 | _2(Vc_buildDouble(-1, 0xc007fa1f72594ull, -1)), // atan P coefficients | |
310 | _2(Vc_buildDouble(-1, 0x028545b6b807aull, 4)), // atan P coefficients | |
311 | _2(Vc_buildDouble(-1, 0x2c08c36880273ull, 6)), // atan P coefficients | |
312 | _2(Vc_buildDouble(-1, 0xeb8bf2d05ba25ull, 6)), // atan P coefficients | |
313 | // cacheline 11 | |
314 | _2(Vc_buildDouble(-1, 0x03669fd28ec8eull, 6)), // atan P coefficients | |
315 | _2(Vc_buildDouble( 1, 0x8dbc45b14603cull, 4)), // atan Q coefficients | |
316 | _2(Vc_buildDouble( 1, 0x4a0dd43b8fa25ull, 7)), // atan Q coefficients | |
317 | _2(Vc_buildDouble( 1, 0xb0e18d2e2be3bull, 8)), // atan Q coefficients | |
318 | // cacheline 12 | |
319 | _2(Vc_buildDouble( 1, 0xe563f13b049eaull, 8)), // atan Q coefficients | |
320 | _2(Vc_buildDouble( 1, 0x8519efbbd62ecull, 7)), // atan Q coefficients | |
321 | _2(Vc_buildDouble( 1, 0x3504f333f9de6ull, 1)), // tan( 3/8 π ) | |
322 | _2(0.66), // lower threshold for special casing in atan | |
323 | // cacheline 13 | |
324 | _2(Vc_buildDouble(1, 0x1A62633145C07ull, -54)), // remainder of pi/2 | |
325 | _2(1.e-8), // small asin input threshold | |
326 | _2(0.625), // large asin input threshold | |
327 | _2(0.), // padding | |
328 | // cacheline 14 | |
329 | _2(Vc_buildDouble( 1, 0x84fc3988e9f08ull, -9)), // asinCoeff0 | |
330 | _2(Vc_buildDouble(-1, 0x2079259f9290full, -1)), // asinCoeff0 | |
331 | _2(Vc_buildDouble( 1, 0xbdff5baf33e6aull, 2)), // asinCoeff0 | |
332 | _2(Vc_buildDouble(-1, 0x991aaac01ab68ull, 4)), // asinCoeff0 | |
333 | // cacheline 15 | |
334 | _2(Vc_buildDouble( 1, 0xc896240f3081dull, 4)), // asinCoeff0 | |
335 | _2(Vc_buildDouble(-1, 0x5f2a2b6bf5d8cull, 4)), // asinCoeff1 | |
336 | _2(Vc_buildDouble( 1, 0x26219af6a7f42ull, 7)), // asinCoeff1 | |
337 | _2(Vc_buildDouble(-1, 0x7fe08959063eeull, 8)), // asinCoeff1 | |
338 | // cacheline 16 | |
339 | _2(Vc_buildDouble( 1, 0x56709b0b644beull, 8)), // asinCoeff1 | |
340 | _2(Vc_buildDouble( 1, 0x16b9b0bd48ad3ull, -8)), // asinCoeff2 | |
341 | _2(Vc_buildDouble(-1, 0x34341333e5c16ull, -1)), // asinCoeff2 | |
342 | _2(Vc_buildDouble( 1, 0x5c74b178a2dd9ull, 2)), // asinCoeff2 | |
343 | // cacheline 17 | |
344 | _2(Vc_buildDouble(-1, 0x04331de27907bull, 4)), // asinCoeff2 | |
345 | _2(Vc_buildDouble( 1, 0x39007da779259ull, 4)), // asinCoeff2 | |
346 | _2(Vc_buildDouble(-1, 0x0656c06ceafd5ull, 3)), // asinCoeff2 | |
347 | _2(Vc_buildDouble(-1, 0xd7b590b5e0eabull, 3)), // asinCoeff3 | |
348 | // cacheline 18 | |
349 | _2(Vc_buildDouble( 1, 0x19fc025fe9054ull, 6)), // asinCoeff3 | |
350 | _2(Vc_buildDouble(-1, 0x265bb6d3576d7ull, 7)), // asinCoeff3 | |
351 | _2(Vc_buildDouble( 1, 0x1705684ffbf9dull, 7)), // asinCoeff3 | |
352 | _2(Vc_buildDouble(-1, 0x898220a3607acull, 5)), // asinCoeff3 | |
353 | }; | |
354 | #undef _2 | |
355 | #define _4(x) x, x, x, x | |
356 | template<> const float c_trig<float>::data[] = { | |
357 | // cacheline | |
358 | _4(Vc_buildFloat( 1, 0x490FDB, -1)), // π/4 | |
359 | _4(Vc_buildFloat( 1, 0x491000, -1)), // π/4 - 12 bits precision | |
360 | _4(Vc_buildFloat(-1, 0x157000, -19)), // π/4 remainder1 - 12 bits precision | |
361 | _4(Vc_buildFloat(-1, 0x6F4B9F, -32)), // π/4 remainder2 | |
362 | // cacheline | |
363 | _4(0.0625f), | |
364 | _4(16.f), | |
365 | _4(0.f), // padding | |
366 | _4(0.f), // padding | |
367 | // cacheline | |
368 | _4(4.166664568298827e-2f), // ~ 1/4! | |
369 | _4(-1.388731625493765e-3f), // ~-1/6! | |
370 | _4(2.443315711809948e-5f), // ~ 1/8! | |
371 | _4(0.f), // padding (for alignment with double) | |
372 | // cacheline | |
373 | _4(0.f), // padding (for alignment with double) | |
374 | _4(0.f), // padding (for alignment with double) | |
375 | _4(-1.6666654611e-1f), // ~-1/3! | |
376 | _4(8.3321608736e-3f), // ~ 1/5! | |
377 | // cacheline | |
378 | _4(-1.9515295891e-4f), // ~-1/7! | |
379 | _4(0.f), // padding (for alignment with double) | |
380 | _4(0.f), // padding (for alignment with double) | |
381 | _4(0.f), // padding (for alignment with double) | |
382 | // cacheline | |
383 | _4(8192.f), // loss threshold | |
384 | _4(Vc_buildFloat(1, 0x22F983, 0)), // 1.27323949337005615234375 = 4/π | |
385 | _4(Vc_buildFloat(1, 0x490FDB, 0)), // π/2 | |
386 | _4(Vc_buildFloat(1, 0x490FDB, 1)), // π | |
387 | // cacheline | |
388 | _4(8.05374449538e-2f), // atan P coefficients | |
389 | _4(1.38776856032e-1f), // atan P coefficients | |
390 | _4(1.99777106478e-1f), // atan P coefficients | |
391 | _4(3.33329491539e-1f), // atan P coefficients | |
392 | // cacheline | |
393 | _4(0.f), // padding (for alignment with double) | |
394 | _4(0.f), // padding (for alignment with double) | |
395 | _4(0.f), // padding (for alignment with double) | |
396 | _4(0.f), // padding (for alignment with double) | |
397 | // cacheline | |
398 | _4(0.f), // padding (for alignment with double) | |
399 | _4(0.f), // padding (for alignment with double) | |
400 | _4(2.414213562373095f), // tan( 3/8 π ) | |
401 | _4(0.414213562373095f), // tan( 1/8 π ) lower threshold for special casing in atan | |
402 | // cacheline | |
403 | _4(Vc_buildFloat(-1, 0x3BBD2E, -25)), // remainder of pi/2 | |
404 | _4(1.e-4f), // small asin input threshold | |
405 | _4(0.f), // padding (for alignment with double) | |
406 | _4(0.f), // padding (for alignment with double) | |
407 | // cacheline | |
408 | _4(4.2163199048e-2f), // asinCoeff0 | |
409 | _4(2.4181311049e-2f), // asinCoeff0 | |
410 | _4(4.5470025998e-2f), // asinCoeff0 | |
411 | _4(7.4953002686e-2f), // asinCoeff0 | |
412 | // cacheline | |
413 | _4(1.6666752422e-1f), // asinCoeff0 | |
414 | _4(0.f), // padding (for alignment with double) | |
415 | _4(0.f), // padding (for alignment with double) | |
416 | _4(0.f), // padding (for alignment with double) | |
417 | // cacheline | |
418 | _4(0.f), // padding (for alignment with double) | |
419 | _4(0.f), // padding (for alignment with double) | |
420 | _4(0.f), // padding (for alignment with double) | |
421 | _4(0.f), // padding (for alignment with double) | |
422 | // cacheline | |
423 | _4(0.f), // padding (for alignment with double) | |
424 | _4(0.f), // padding (for alignment with double) | |
425 | _4(0.f), // padding (for alignment with double) | |
426 | _4(0.f), // padding (for alignment with double) | |
427 | // cacheline | |
428 | _4(0.f), // padding (for alignment with double) | |
429 | _4(0.f), // padding (for alignment with double) | |
430 | _4(0.f), // padding (for alignment with double) | |
431 | _4(0.f), // padding (for alignment with double) | |
432 | }; | |
433 | #undef _4 | |
434 | ||
435 | // cacheline 8 | |
436 | V_ALIGN(16) extern const unsigned char _IndexesFromZero16[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; | |
437 | ||
438 | V_ALIGN(64) const unsigned long long c_log<double>::data[21 * 2] = { | |
439 | /* 0*/ 0x000003ff000003ffull, 0x000003ff000003ffull // bias TODO: remove | |
440 | /* 1*/ , 0x7ff0000000000000ull, 0x7ff0000000000000ull // exponentMask (+inf) | |
441 | ||
442 | /* 2*/ , 0x3f1ab4c293c31bb0ull, 0x3f1ab4c293c31bb0ull // P[0] | |
443 | /* 3*/ , 0x3fdfd6f53f5652f2ull, 0x3fdfd6f53f5652f2ull // P[1] | |
444 | /* 4*/ , 0x4012d2baed926911ull, 0x4012d2baed926911ull // P[2] | |
445 | /* 5*/ , 0x402cff72c63eeb2eull, 0x402cff72c63eeb2eull // P[3] | |
446 | /* 6*/ , 0x4031efd6924bc84dull, 0x4031efd6924bc84dull // P[4] | |
447 | /* 7*/ , 0x401ed5637d7edcf8ull, 0x401ed5637d7edcf8ull // P[5] | |
448 | ||
449 | /* 8*/ , 0x40269320ae97ef8eull, 0x40269320ae97ef8eull // Q[0] | |
450 | /* 9*/ , 0x40469d2c4e19c033ull, 0x40469d2c4e19c033ull // Q[1] | |
451 | /*10*/ , 0x4054bf33a326bdbdull, 0x4054bf33a326bdbdull // Q[2] | |
452 | /*11*/ , 0x4051c9e2eb5eae21ull, 0x4051c9e2eb5eae21ull // Q[3] | |
453 | /*12*/ , 0x4037200a9e1f25b2ull, 0x4037200a9e1f25b2ull // Q[4] | |
454 | ||
455 | /*13*/ , 0xfff0000000000000ull, 0xfff0000000000000ull // -inf | |
456 | /*14*/ , 0x0010000000000000ull, 0x0010000000000000ull // min() | |
457 | /*15*/ , 0x3fe6a09e667f3bcdull, 0x3fe6a09e667f3bcdull // 1/sqrt(2) | |
458 | /*16*/ , 0x3fe6300000000000ull, 0x3fe6300000000000ull // round(ln(2) * 512) / 512 | |
459 | /*17*/ , 0xbf2bd0105c610ca8ull, 0xbf2bd0105c610ca8ull // ln(2) - round(ln(2) * 512) / 512 | |
460 | /*18*/ , 0x3fe0000000000000ull, 0x3fe0000000000000ull // 0.5 | |
461 | /*19*/ , 0x3fdbcb7b1526e50eull, 0x3fdbcb7b1526e50eull // log10(e) | |
462 | /*20*/ , 0x3ff71547652b82feull, 0x3ff71547652b82feull // log2(e) | |
463 | }; | |
464 | ||
465 | template<> V_ALIGN(64) const unsigned int c_log<float>::data[21 * 4] = { | |
466 | 0x0000007fu, 0x0000007fu, 0x0000007fu, 0x0000007fu, // bias TODO: remove | |
467 | 0x7f800000u, 0x7f800000u, 0x7f800000u, 0x7f800000u, // exponentMask (+inf) | |
468 | ||
469 | 0x3d9021bbu, 0x3d9021bbu, 0x3d9021bbu, 0x3d9021bbu, // 7.0376836292e-2f // P[0] | |
470 | 0xbdebd1b8u, 0xbdebd1b8u, 0xbdebd1b8u, 0xbdebd1b8u, // -1.1514610310e-1f // P[1] | |
471 | 0x3def251au, 0x3def251au, 0x3def251au, 0x3def251au, // 1.1676998740e-1f // P[2] | |
472 | 0xbdfe5d4fu, 0xbdfe5d4fu, 0xbdfe5d4fu, 0xbdfe5d4fu, // -1.2420140846e-1f // P[3] | |
473 | 0x3e11e9bfu, 0x3e11e9bfu, 0x3e11e9bfu, 0x3e11e9bfu, // 1.4249322787e-1f // P[4] | |
474 | 0xbe2aae50u, 0xbe2aae50u, 0xbe2aae50u, 0xbe2aae50u, // -1.6668057665e-1f // P[5] | |
475 | 0x3e4cceacu, 0x3e4cceacu, 0x3e4cceacu, 0x3e4cceacu, // 2.0000714765e-1f // P[6] | |
476 | 0xbe7ffffcu, 0xbe7ffffcu, 0xbe7ffffcu, 0xbe7ffffcu, // -2.4999993993e-1f // P[7] | |
477 | 0x3eaaaaaau, 0x3eaaaaaau, 0x3eaaaaaau, 0x3eaaaaaau, // 3.3333331174e-1f // P[8] | |
478 | 0, 0, 0, 0, // padding because of c_log<double> | |
479 | 0, 0, 0, 0, // padding because of c_log<double> | |
480 | ||
481 | 0xff800000u, 0xff800000u, 0xff800000u, 0xff800000u, // -inf | |
482 | 0x00800000u, 0x00800000u, 0x00800000u, 0x00800000u, // min() | |
483 | 0x3f3504f3u, 0x3f3504f3u, 0x3f3504f3u, 0x3f3504f3u, // 1/sqrt(2) | |
484 | // ln(2) = 0x3fe62e42fefa39ef | |
485 | // ln(2) = Vc_buildDouble( 1, 0x00062e42fefa39ef, -1) | |
486 | // = Vc_buildFloat( 1, 0x00317217(f7d), -1) + Vc_buildFloat( 1, 0x0077d1cd, -25) | |
487 | // = Vc_buildFloat( 1, 0x00318000(000), -1) + Vc_buildFloat(-1, 0x005e8083, -13) | |
488 | 0x3f318000u, 0x3f318000u, 0x3f318000u, 0x3f318000u, // round(ln(2) * 512) / 512 | |
489 | 0xb95e8083u, 0xb95e8083u, 0xb95e8083u, 0xb95e8083u, // ln(2) - round(ln(2) * 512) / 512 | |
490 | 0x3f000000u, 0x3f000000u, 0x3f000000u, 0x3f000000u, // 0.5 | |
491 | 0x3ede5bd9u, 0x3ede5bd9u, 0x3ede5bd9u, 0x3ede5bd9u, // log10(e) | |
492 | 0x3fb8aa3bu, 0x3fb8aa3bu, 0x3fb8aa3bu, 0x3fb8aa3bu, // log2(e) | |
493 | // log10(2) = 0x3fd34413509f79ff | |
494 | // = Vc_buildDouble( 1, 0x00034413509f79ff, -2) | |
495 | // = Vc_buildFloat( 1, 0x001a209a(84fbcff8), -2) + Vc_buildFloat( 1, 0x0004fbcff(8), -26) | |
496 | //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2) | |
497 | //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2) | |
498 | //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2) | |
499 | //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2) | |
500 | }; | |
501 | } // namespace SSE | |
502 | ||
503 | V_ALIGN(64) unsigned int RandomState[16] = { | |
504 | 0x5a383a4fu, 0xc68bd45eu, 0x691d6d86u, 0xb367e14fu, | |
505 | 0xd689dbaau, 0xfde442aau, 0x3d265423u, 0x1a77885cu, | |
506 | 0x36ed2684u, 0xfb1f049du, 0x19e52f31u, 0x821e4dd7u, | |
507 | 0x23996d25u, 0x5962725au, 0x6aced4ceu, 0xd4c610f3u | |
508 | }; | |
509 | ||
// Dummy symbol used to emit warnings with GCC 4.3. The function has an empty
// body; only its existence as a symbol matters.
namespace Warnings
{
void _operator_bracket_warning()
{
}
} // namespace Warnings
514 | ||
515 | const char LIBRARY_VERSION[] = VC_VERSION_STRING; | |
516 | const unsigned int LIBRARY_VERSION_NUMBER = VC_VERSION_NUMBER; | |
517 | const unsigned int LIBRARY_ABI_VERSION = VC_LIBRARY_ABI_VERSION; | |
518 | ||
519 | void checkLibraryAbi(unsigned int compileTimeAbi, unsigned int versionNumber, const char *compileTimeVersion) { | |
520 | if (LIBRARY_ABI_VERSION != compileTimeAbi || LIBRARY_VERSION_NUMBER < versionNumber) { | |
521 | printf("The versions of libVc.a (%s) and Vc/version.h (%s) are incompatible. Aborting.\n", LIBRARY_VERSION, compileTimeVersion); | |
522 | abort(); | |
523 | } | |
524 | } | |
525 | ||
526 | } // namespace Vc | |
527 | } // namespace AliRoot | |
528 | ||
529 | #undef V_ALIGN |