Vc/src/const.cpp

   1 /*  This file is part of the Vc library.
   2
   3     Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>
   4
   5     Vc is free software: you can redistribute it and/or modify
   6     it under the terms of the GNU Lesser General Public License as
   7     published by the Free Software Foundation, either version 3 of
   8     the License, or (at your option) any later version.
   9
  10     Vc is distributed in the hope that it will be useful, but
  11     WITHOUT ANY WARRANTY; without even the implied warranty of
  12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13     GNU Lesser General Public License for more details.
  14
  15     You should have received a copy of the GNU Lesser General Public
  16     License along with Vc.  If not, see <http://www.gnu.org/licenses/>.
  17
  18 */
  19
  20 #ifndef V_ALIGN // only define V_ALIGN when it is not already defined; it is #undef'd again at the end of this file
  21 # ifdef __GNUC__ // compilers defining __GNUC__: use the aligned attribute
  22 #  define V_ALIGN(n) __attribute__((aligned(n)))
  23 # else // all other compilers: use the __declspec(align) spelling
  24 #  define V_ALIGN(n) __declspec(align(n))
  25 # endif
  26 #endif
  27
  28 #include "Vc/avx/const_data.h"
  29 #include "Vc/sse/const_data.h"
  30 #include <Vc/version.h>
  31
  32 #include <cstdio>
  33 #include <cstdlib>
  34 #include <cstring>
  35
  36 #include "Vc/common/macros.h"
  37
  38 namespace AliRoot {
  39 namespace Vc
  40 {
  41 namespace AVX
  42 {
  43     // cacheline 1: ascending index tables 0, 1, 2, ...; the numeric suffix follows the element type (unsigned int, short, char)
  44     V_ALIGN(64) extern const unsigned int   _IndexesFromZero32[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; // `extern` on a const definition gives the array external linkage
  45     V_ALIGN(16) extern const unsigned short _IndexesFromZero16[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  46     V_ALIGN(16) extern const unsigned char  _IndexesFromZero8 [16]= { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  47
  48     template<> const double c_trig<double>::data[] = {
  49     // cacheline 4
  50         Vc_buildDouble(1, 0x921fb54442d18ull, -1), // π/4
  51         Vc_buildDouble(1, 0x921fb40000000ull, -1), // π/4 - 30bits precision
  52         Vc_buildDouble(1, 0x4442d00000000ull, -25), // π/4 remainder1 - 32bits precision
  53         Vc_buildDouble(1, 0x8469898cc5170ull, -49), // π/4 remainder2
  54         0.0625,
  55         16.,
  56         0., // padding
  57         0., // padding
  58     // cacheline 5
  59         Vc_buildDouble( 1, 0x555555555554bull,  -5), // ~ 1/4!
  60         Vc_buildDouble(-1, 0x6c16c16c14f91ull, -10), // ~-1/6!
  61         Vc_buildDouble( 1, 0xa01a019c844f5ull, -16), // ~ 1/8!
  62         Vc_buildDouble(-1, 0x27e4f7eac4bc6ull, -22), // ~-1/10!
  63         Vc_buildDouble( 1, 0x1ee9d7b4e3f05ull, -29), // ~ 1/12!
  64         Vc_buildDouble(-1, 0x8fa49a0861a9bull, -37), // ~-1/14!
  65         Vc_buildDouble(-1, 0x5555555555548ull,  -3), // ~-1/3!
  66         Vc_buildDouble( 1, 0x111111110f7d0ull,  -7), // ~ 1/5!
  67     // cacheline 8
  68         Vc_buildDouble(-1, 0xa01a019bfdf03ull, -13), // ~-1/7!
  69         Vc_buildDouble( 1, 0x71de3567d48a1ull, -19), // ~ 1/9!
  70         Vc_buildDouble(-1, 0xae5e5a9291f5dull, -26), // ~-1/11!
  71         Vc_buildDouble( 1, 0x5d8fd1fd19ccdull, -33), // ~ 1/13!
  72         0., // padding (for alignment with float)
  73         Vc_buildDouble(1, 0x8BE60DB939105ull,  0), // 4/π
  74         Vc_buildDouble(1, 0x921fb54442d18ull,  0), // π/2
  75         Vc_buildDouble(1, 0x921fb54442d18ull,  1), // π
  76     // cacheline 10
  77         Vc_buildDouble(-1, 0xc007fa1f72594ull, -1), // atan P coefficients
  78         Vc_buildDouble(-1, 0x028545b6b807aull,  4), // atan P coefficients
  79         Vc_buildDouble(-1, 0x2c08c36880273ull,  6), // atan P coefficients
  80         Vc_buildDouble(-1, 0xeb8bf2d05ba25ull,  6), // atan P coefficients
  81         Vc_buildDouble(-1, 0x03669fd28ec8eull,  6), // atan P coefficients
  82         Vc_buildDouble( 1, 0x8dbc45b14603cull,  4), // atan Q coefficients
  83         Vc_buildDouble( 1, 0x4a0dd43b8fa25ull,  7), // atan Q coefficients
  84         Vc_buildDouble( 1, 0xb0e18d2e2be3bull,  8), // atan Q coefficients
  85     // cacheline 12
  86         Vc_buildDouble( 1, 0xe563f13b049eaull,  8), // atan Q coefficients
  87         Vc_buildDouble( 1, 0x8519efbbd62ecull,  7), // atan Q coefficients
  88         Vc_buildDouble( 1, 0x3504f333f9de6ull,  1), // tan( 3/8 π )
  89         0.66,                                    // lower threshold for special casing in atan
  90         Vc_buildDouble(1, 0x1A62633145C07ull, -54), // remainder of pi/2
  91         1.e-8, // small asin input threshold
  92         0.625, // large asin input threshold
  93         0., // padding
  94     // cacheline 14
  95         Vc_buildDouble( 1, 0x84fc3988e9f08ull, -9), // asinCoeff0
  96         Vc_buildDouble(-1, 0x2079259f9290full, -1), // asinCoeff0
  97         Vc_buildDouble( 1, 0xbdff5baf33e6aull,  2), // asinCoeff0
  98         Vc_buildDouble(-1, 0x991aaac01ab68ull,  4), // asinCoeff0
  99         Vc_buildDouble( 1, 0xc896240f3081dull,  4), // asinCoeff0
 100         Vc_buildDouble(-1, 0x5f2a2b6bf5d8cull,  4), // asinCoeff1
 101         Vc_buildDouble( 1, 0x26219af6a7f42ull,  7), // asinCoeff1
 102         Vc_buildDouble(-1, 0x7fe08959063eeull,  8), // asinCoeff1
 103     // cacheline 16
 104         Vc_buildDouble( 1, 0x56709b0b644beull,  8), // asinCoeff1
 105         Vc_buildDouble( 1, 0x16b9b0bd48ad3ull, -8), // asinCoeff2
 106         Vc_buildDouble(-1, 0x34341333e5c16ull, -1), // asinCoeff2
 107         Vc_buildDouble( 1, 0x5c74b178a2dd9ull,  2), // asinCoeff2
 108         Vc_buildDouble(-1, 0x04331de27907bull,  4), // asinCoeff2
 109         Vc_buildDouble( 1, 0x39007da779259ull,  4), // asinCoeff2
 110         Vc_buildDouble(-1, 0x0656c06ceafd5ull,  3), // asinCoeff2
 111         Vc_buildDouble(-1, 0xd7b590b5e0eabull,  3), // asinCoeff3
 112     // cacheline 18
 113         Vc_buildDouble( 1, 0x19fc025fe9054ull,  6), // asinCoeff3
 114         Vc_buildDouble(-1, 0x265bb6d3576d7ull,  7), // asinCoeff3
 115         Vc_buildDouble( 1, 0x1705684ffbf9dull,  7), // asinCoeff3
 116         Vc_buildDouble(-1, 0x898220a3607acull,  5), // asinCoeff3
 117     };
 118 #define _4(x) x
 119     template<> const float c_trig<float>::data[] = { // single-precision counterpart of c_trig<double>::data; 60 entries
 120     // cacheline (_4 is the identity in this namespace, so every line below contributes exactly one float)
 121         _4(Vc_buildFloat( 1, 0x490FDB,  -1)), // π/4
 122         _4(Vc_buildFloat( 1, 0x491000,  -1)), // π/4 - 12 bits precision
 123         _4(Vc_buildFloat(-1, 0x157000, -19)), // π/4 remainder1 - 12 bits precision
 124         _4(Vc_buildFloat(-1, 0x6F4B9F, -32)), // π/4 remainder2
 125         _4(0.0625f),
 126         _4(16.f),
 127         _4(0.f), // padding
 128         _4(0.f), // padding
 129         _4(4.166664568298827e-2f),  // ~ 1/4!
 130         _4(-1.388731625493765e-3f), // ~-1/6!
 131         _4(2.443315711809948e-5f),  // ~ 1/8!
 132         _4(0.f), // padding (for alignment with double)
 133         _4(0.f), // padding (for alignment with double)
 134         _4(0.f), // padding (for alignment with double)
 135         _4(-1.6666654611e-1f), // ~-1/3!
 136         _4(8.3321608736e-3f),  // ~ 1/5!
 137     // cacheline
 138         _4(-1.9515295891e-4f), // ~-1/7!
 139         _4(0.f), // padding (for alignment with double)
 140         _4(0.f), // padding (for alignment with double)
 141         _4(0.f), // padding (for alignment with double)
 142         _4(8192.f), // loss threshold
 143         _4(Vc_buildFloat(1, 0x22F983, 0)), // 1.27323949337005615234375 = 4/π
 144         _4(Vc_buildFloat(1, 0x490FDB, 0)), // π/2
 145         _4(Vc_buildFloat(1, 0x490FDB, 1)), // π
 146         _4(8.05374449538e-2f), // atan P coefficients
 147         _4(1.38776856032e-1f), // atan P coefficients
 148         _4(1.99777106478e-1f), // atan P coefficients
 149         _4(3.33329491539e-1f), // atan P coefficients
 150         _4(0.f), // padding (for alignment with double)
 151         _4(0.f), // padding (for alignment with double)
 152         _4(0.f), // padding (for alignment with double)
 153         _4(0.f), // padding (for alignment with double)
 154     // cacheline
 155         _4(0.f), // padding (for alignment with double)
 156         _4(0.f), // padding (for alignment with double)
 157         _4(2.414213562373095f), // tan( 3/8 π )
 158         _4(0.414213562373095f), // tan( 1/8 π ) lower threshold for special casing in atan
 159         _4(Vc_buildFloat(-1, 0x3BBD2E, -25)), // remainder of pi/2
 160         _4(1.e-4f), // small asin input threshold
 161         _4(0.f), // padding (for alignment with double)
 162         _4(0.f), // padding (for alignment with double)
 163         _4(4.2163199048e-2f), // asinCoeff0
 164         _4(2.4181311049e-2f), // asinCoeff0
 165         _4(4.5470025998e-2f), // asinCoeff0
 166         _4(7.4953002686e-2f), // asinCoeff0
 167         _4(1.6666752422e-1f), // asinCoeff0
 168         _4(0.f), // padding (for alignment with double)
 169         _4(0.f), // padding (for alignment with double)
 170         _4(0.f), // padding (for alignment with double)
 171     // cacheline
 172         _4(0.f), // padding (for alignment with double)
 173         _4(0.f), // padding (for alignment with double)
 174         _4(0.f), // padding (for alignment with double)
 175         _4(0.f), // padding (for alignment with double)
 176         _4(0.f), // padding (for alignment with double)
 177         _4(0.f), // padding (for alignment with double)
 178         _4(0.f), // padding (for alignment with double)
 179         _4(0.f), // padding (for alignment with double)
 180         _4(0.f), // padding (for alignment with double)
 181         _4(0.f), // padding (for alignment with double)
 182         _4(0.f), // padding (for alignment with double)
 183         _4(0.f), // padding (for alignment with double)
 184     };
 185 #undef _4
 186
 187     const unsigned       int c_general::absMaskFloat[2] = { 0xffffffffu, 0x7fffffffu }; // [1] has every bit but the top one set; NOTE(review): the pair presumably also serves as a 64-bit 0x7fffffffffffffff mask on little-endian targets - confirm
 188     const unsigned       int c_general::signMaskFloat[2] = { 0x0u, 0x80000000u }; // [1] has only the top bit set
 189     const unsigned       int c_general::highMaskFloat = 0xfffff000u; // low 12 bits cleared
 190     const              float c_general::oneFloat = 1.f;
 191     const unsigned     short c_general::minShort[2] = { 0x8000u, 0x8000u };
 192     const unsigned     short c_general::one16[2] = { 1, 1 };
 193     const              float c_general::_2power31 = 1u << 31; // unsigned 2^31 converted to float
 194
 195     // cacheline 4 (64-bit wide members of c_general)
 196     const unsigned long long c_general::highMaskDouble = 0xfffffffff8000000ull; // low 27 bits cleared
 197     const             double c_general::oneDouble = 1.;
 198     const unsigned long long c_general::frexpMask = 0xbfefffffffffffffull; // every bit set except bits 62 and 52
 199
 200     const unsigned long long c_log<double>::data[21] = { // 21 logarithm constants stored as 64-bit integer patterns; how they are read back is defined outside this file
 201         0x000003ff000003ffull // bias TODO: remove
 202       , 0x7ff0000000000000ull // exponentMask (+inf)
 203
 204       , 0x3f1ab4c293c31bb0ull // P[0]
 205       , 0x3fdfd6f53f5652f2ull // P[1]
 206       , 0x4012d2baed926911ull // P[2]
 207       , 0x402cff72c63eeb2eull // P[3]
 208       , 0x4031efd6924bc84dull // P[4]
 209       , 0x401ed5637d7edcf8ull // P[5]
 210
 211       , 0x40269320ae97ef8eull // Q[0]
 212       , 0x40469d2c4e19c033ull // Q[1]
 213       , 0x4054bf33a326bdbdull // Q[2]
 214       , 0x4051c9e2eb5eae21ull // Q[3]
 215       , 0x4037200a9e1f25b2ull // Q[4]
 216
 217       , 0xfff0000000000000ull // -inf
 218       , 0x0010000000000000ull // min()
 219       , 0x3fe6a09e667f3bcdull // 1/sqrt(2)
 220       , 0x3fe6300000000000ull // round(ln(2) * 512) / 512
 221       , 0xbf2bd0105c610ca8ull // ln(2) - round(ln(2) * 512) / 512
 222       , 0x3fe0000000000000ull // 0.5
 223       , 0x3fdbcb7b1526e50eull // log10(e)
 224       , 0x3ff71547652b82feull // log2(e)
 225     };
 226
 227     template<> const unsigned int c_log<float>::data[21] = { // single-precision logarithm constants as 32-bit patterns; same 21 slots as c_log<double>::data
 228         0x0000007fu // bias TODO: remove
 229       , 0x7f800000u // exponentMask (+inf)
 230
 231       , 0x3d9021bbu //  7.0376836292e-2f // P[0]
 232       , 0xbdebd1b8u // -1.1514610310e-1f // P[1]
 233       , 0x3def251au //  1.1676998740e-1f // P[2]
 234       , 0xbdfe5d4fu // -1.2420140846e-1f // P[3]
 235       , 0x3e11e9bfu //  1.4249322787e-1f // P[4]
 236       , 0xbe2aae50u // -1.6668057665e-1f // P[5]
 237       , 0x3e4cceacu //  2.0000714765e-1f // P[6]
 238       , 0xbe7ffffcu // -2.4999993993e-1f // P[7]
 239       , 0x3eaaaaaau //  3.3333331174e-1f // P[8]
 240       , 0           // padding because of c_log<double>
 241       , 0           // padding because of c_log<double>
 242
 243       , 0xff800000u // -inf
 244       , 0x00800000u // min()
 245       , 0x3f3504f3u // 1/sqrt(2)
 246       , 0x3f318000u // round(ln(2) * 512) / 512
 247       , 0xb95e8083u // ln(2) - round(ln(2) * 512) / 512
 248       , 0x3f000000u // 0.5
 249       , 0x3ede5bd9u // log10(e)
 250       , 0x3fb8aa3bu // log2(e)
 251     };
 252 } // namespace AVX
 253
 254 namespace SSE
 255 {
 256     // cacheline 1: c_general masks, each value repeated once per element of a 16-byte array
 257     V_ALIGN(64) const int c_general::absMaskFloat[4] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; // every bit but the top one set
 258     V_ALIGN(16) const unsigned int c_general::signMaskFloat[4] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; // only the top bit set
 259     V_ALIGN(16) const unsigned int c_general::highMaskFloat[4] = { 0xfffff000u, 0xfffff000u, 0xfffff000u, 0xfffff000u }; // low 12 bits cleared
 260     V_ALIGN(16) const short c_general::minShort[8] = { -0x8000, -0x8000, -0x8000, -0x8000, -0x8000, -0x8000, -0x8000, -0x8000 };
 261     V_ALIGN(16) extern const unsigned short _IndexesFromZero8[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; // in this namespace the suffix is the element count, not the bit width
 262
 263     // cacheline 2
 264     V_ALIGN(16) extern const unsigned int   _IndexesFromZero4[4] = { 0, 1, 2, 3 };
 265     V_ALIGN(16) const unsigned short c_general::one16[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };
 266     V_ALIGN(16) const unsigned int c_general::one32[4] = { 1, 1, 1, 1 };
 267     V_ALIGN(16) const float c_general::oneFloat[4] = { 1.f, 1.f, 1.f, 1.f };
 268
 269     // cacheline 3: 64-bit wide members, two elements each
 270     V_ALIGN(16) const unsigned long long c_general::highMaskDouble[2] = { 0xfffffffff8000000ull, 0xfffffffff8000000ull }; // low 27 bits cleared
 271     V_ALIGN(16) const double c_general::oneDouble[2] = { 1., 1. };
 272     V_ALIGN(16) const long long c_general::absMaskDouble[2] = { 0x7fffffffffffffffll, 0x7fffffffffffffffll }; // every bit but the top one set
 273     V_ALIGN(16) const unsigned long long c_general::signMaskDouble[2] = { 0x8000000000000000ull, 0x8000000000000000ull }; // only the top bit set
 274     V_ALIGN(16) const unsigned long long c_general::frexpMask[2] = { 0xbfefffffffffffffull, 0xbfefffffffffffffull }; // every bit set except bits 62 and 52
 275
 276 #define _2(x) x, x
 277     template<> const double c_trig<double>::data[] = {
 278     // cacheline 4
 279         _2(Vc_buildDouble(1, 0x921fb54442d18ull, -1)), // π/4
 280         _2(Vc_buildDouble(1, 0x921fb40000000ull, -1)), // π/4 - 30bits precision
 281         _2(Vc_buildDouble(1, 0x4442d00000000ull, -25)), // π/4 remainder1 - 32bits precision
 282         _2(Vc_buildDouble(1, 0x8469898cc5170ull, -49)), // π/4 remainder2
 283     // cacheline 5
 284         _2(0.0625),
 285         _2(16.),
 286         _2(0.), // padding
 287         _2(0.), // padding
 288     // cacheline 6
 289         _2(Vc_buildDouble( 1, 0x555555555554bull,  -5)), // ~ 1/4!
 290         _2(Vc_buildDouble(-1, 0x6c16c16c14f91ull, -10)), // ~-1/6!
 291         _2(Vc_buildDouble( 1, 0xa01a019c844f5ull, -16)), // ~ 1/8!
 292         _2(Vc_buildDouble(-1, 0x27e4f7eac4bc6ull, -22)), // ~-1/10!
 293     // cacheline 7
 294         _2(Vc_buildDouble( 1, 0x1ee9d7b4e3f05ull, -29)), // ~ 1/12!
 295         _2(Vc_buildDouble(-1, 0x8fa49a0861a9bull, -37)), // ~-1/14!
 296         _2(Vc_buildDouble(-1, 0x5555555555548ull,  -3)), // ~-1/3!
 297         _2(Vc_buildDouble( 1, 0x111111110f7d0ull,  -7)), // ~ 1/5!
 298     // cacheline 8
 299         _2(Vc_buildDouble(-1, 0xa01a019bfdf03ull, -13)), // ~-1/7!
 300         _2(Vc_buildDouble( 1, 0x71de3567d48a1ull, -19)), // ~ 1/9!
 301         _2(Vc_buildDouble(-1, 0xae5e5a9291f5dull, -26)), // ~-1/11!
 302         _2(Vc_buildDouble( 1, 0x5d8fd1fd19ccdull, -33)), // ~ 1/13!
 303     // cacheline 9
 304         _2(0.), // padding (for alignment with float)
 305         _2(Vc_buildDouble(1, 0x8BE60DB939105ull,  0)), // 4/π
 306         _2(Vc_buildDouble(1, 0x921fb54442d18ull,  0)), // π/2
 307         _2(Vc_buildDouble(1, 0x921fb54442d18ull,  1)), // π
 308     // cacheline 10
 309         _2(Vc_buildDouble(-1, 0xc007fa1f72594ull, -1)), // atan P coefficients
 310         _2(Vc_buildDouble(-1, 0x028545b6b807aull,  4)), // atan P coefficients
 311         _2(Vc_buildDouble(-1, 0x2c08c36880273ull,  6)), // atan P coefficients
 312         _2(Vc_buildDouble(-1, 0xeb8bf2d05ba25ull,  6)), // atan P coefficients
 313     // cacheline 11
 314         _2(Vc_buildDouble(-1, 0x03669fd28ec8eull,  6)), // atan P coefficients
 315         _2(Vc_buildDouble( 1, 0x8dbc45b14603cull,  4)), // atan Q coefficients
 316         _2(Vc_buildDouble( 1, 0x4a0dd43b8fa25ull,  7)), // atan Q coefficients
 317         _2(Vc_buildDouble( 1, 0xb0e18d2e2be3bull,  8)), // atan Q coefficients
 318     // cacheline 12
 319         _2(Vc_buildDouble( 1, 0xe563f13b049eaull,  8)), // atan Q coefficients
 320         _2(Vc_buildDouble( 1, 0x8519efbbd62ecull,  7)), // atan Q coefficients
 321         _2(Vc_buildDouble( 1, 0x3504f333f9de6ull,  1)), // tan( 3/8 π )
 322         _2(0.66),                                    // lower threshold for special casing in atan
 323     // cacheline 13
 324         _2(Vc_buildDouble(1, 0x1A62633145C07ull, -54)), // remainder of pi/2
 325         _2(1.e-8), // small asin input threshold
 326         _2(0.625), // large asin input threshold
 327         _2(0.), // padding
 328     // cacheline 14
 329         _2(Vc_buildDouble( 1, 0x84fc3988e9f08ull, -9)), // asinCoeff0
 330         _2(Vc_buildDouble(-1, 0x2079259f9290full, -1)), // asinCoeff0
 331         _2(Vc_buildDouble( 1, 0xbdff5baf33e6aull,  2)), // asinCoeff0
 332         _2(Vc_buildDouble(-1, 0x991aaac01ab68ull,  4)), // asinCoeff0
 333     // cacheline 15
 334         _2(Vc_buildDouble( 1, 0xc896240f3081dull,  4)), // asinCoeff0
 335         _2(Vc_buildDouble(-1, 0x5f2a2b6bf5d8cull,  4)), // asinCoeff1
 336         _2(Vc_buildDouble( 1, 0x26219af6a7f42ull,  7)), // asinCoeff1
 337         _2(Vc_buildDouble(-1, 0x7fe08959063eeull,  8)), // asinCoeff1
 338     // cacheline 16
 339         _2(Vc_buildDouble( 1, 0x56709b0b644beull,  8)), // asinCoeff1
 340         _2(Vc_buildDouble( 1, 0x16b9b0bd48ad3ull, -8)), // asinCoeff2
 341         _2(Vc_buildDouble(-1, 0x34341333e5c16ull, -1)), // asinCoeff2
 342         _2(Vc_buildDouble( 1, 0x5c74b178a2dd9ull,  2)), // asinCoeff2
 343     // cacheline 17
 344         _2(Vc_buildDouble(-1, 0x04331de27907bull,  4)), // asinCoeff2
 345         _2(Vc_buildDouble( 1, 0x39007da779259ull,  4)), // asinCoeff2
 346         _2(Vc_buildDouble(-1, 0x0656c06ceafd5ull,  3)), // asinCoeff2
 347         _2(Vc_buildDouble(-1, 0xd7b590b5e0eabull,  3)), // asinCoeff3
 348     // cacheline 18
 349         _2(Vc_buildDouble( 1, 0x19fc025fe9054ull,  6)), // asinCoeff3
 350         _2(Vc_buildDouble(-1, 0x265bb6d3576d7ull,  7)), // asinCoeff3
 351         _2(Vc_buildDouble( 1, 0x1705684ffbf9dull,  7)), // asinCoeff3
 352         _2(Vc_buildDouble(-1, 0x898220a3607acull,  5)), // asinCoeff3
 353     };
 354 #undef _2
 355 #define _4(x) x, x, x, x
 356     template<> const float c_trig<float>::data[] = { // single-precision counterpart of c_trig<double>::data; 60 values, each stored four times
 357     // cacheline (_4 emits each constant four times, so every group of four lines is 16 floats)
 358         _4(Vc_buildFloat( 1, 0x490FDB,  -1)), // π/4
 359         _4(Vc_buildFloat( 1, 0x491000,  -1)), // π/4 - 12 bits precision
 360         _4(Vc_buildFloat(-1, 0x157000, -19)), // π/4 remainder1 - 12 bits precision
 361         _4(Vc_buildFloat(-1, 0x6F4B9F, -32)), // π/4 remainder2
 362     // cacheline
 363         _4(0.0625f),
 364         _4(16.f),
 365         _4(0.f), // padding
 366         _4(0.f), // padding
 367     // cacheline
 368         _4(4.166664568298827e-2f),  // ~ 1/4!
 369         _4(-1.388731625493765e-3f), // ~-1/6!
 370         _4(2.443315711809948e-5f),  // ~ 1/8!
 371         _4(0.f), // padding (for alignment with double)
 372     // cacheline
 373         _4(0.f), // padding (for alignment with double)
 374         _4(0.f), // padding (for alignment with double)
 375         _4(-1.6666654611e-1f), // ~-1/3!
 376         _4(8.3321608736e-3f),  // ~ 1/5!
 377     // cacheline
 378         _4(-1.9515295891e-4f), // ~-1/7!
 379         _4(0.f), // padding (for alignment with double)
 380         _4(0.f), // padding (for alignment with double)
 381         _4(0.f), // padding (for alignment with double)
 382     // cacheline
 383         _4(8192.f), // loss threshold
 384         _4(Vc_buildFloat(1, 0x22F983, 0)), // 1.27323949337005615234375 = 4/π
 385         _4(Vc_buildFloat(1, 0x490FDB, 0)), // π/2
 386         _4(Vc_buildFloat(1, 0x490FDB, 1)), // π
 387     // cacheline
 388         _4(8.05374449538e-2f), // atan P coefficients
 389         _4(1.38776856032e-1f), // atan P coefficients
 390         _4(1.99777106478e-1f), // atan P coefficients
 391         _4(3.33329491539e-1f), // atan P coefficients
 392     // cacheline
 393         _4(0.f), // padding (for alignment with double)
 394         _4(0.f), // padding (for alignment with double)
 395         _4(0.f), // padding (for alignment with double)
 396         _4(0.f), // padding (for alignment with double)
 397     // cacheline
 398         _4(0.f), // padding (for alignment with double)
 399         _4(0.f), // padding (for alignment with double)
 400         _4(2.414213562373095f), // tan( 3/8 π )
 401         _4(0.414213562373095f), // tan( 1/8 π ) lower threshold for special casing in atan
 402     // cacheline
 403         _4(Vc_buildFloat(-1, 0x3BBD2E, -25)), // remainder of pi/2
 404         _4(1.e-4f), // small asin input threshold
 405         _4(0.f), // padding (for alignment with double)
 406         _4(0.f), // padding (for alignment with double)
 407     // cacheline
 408         _4(4.2163199048e-2f), // asinCoeff0
 409         _4(2.4181311049e-2f), // asinCoeff0
 410         _4(4.5470025998e-2f), // asinCoeff0
 411         _4(7.4953002686e-2f), // asinCoeff0
 412     // cacheline
 413         _4(1.6666752422e-1f), // asinCoeff0
 414         _4(0.f), // padding (for alignment with double)
 415         _4(0.f), // padding (for alignment with double)
 416         _4(0.f), // padding (for alignment with double)
 417     // cacheline
 418         _4(0.f), // padding (for alignment with double)
 419         _4(0.f), // padding (for alignment with double)
 420         _4(0.f), // padding (for alignment with double)
 421         _4(0.f), // padding (for alignment with double)
 422     // cacheline
 423         _4(0.f), // padding (for alignment with double)
 424         _4(0.f), // padding (for alignment with double)
 425         _4(0.f), // padding (for alignment with double)
 426         _4(0.f), // padding (for alignment with double)
 427     // cacheline
 428         _4(0.f), // padding (for alignment with double)
 429         _4(0.f), // padding (for alignment with double)
 430         _4(0.f), // padding (for alignment with double)
 431         _4(0.f), // padding (for alignment with double)
 432     };
 433 #undef _4
 434
 435     // cacheline 8: ascending index table 0..15 with one byte per entry
 436     V_ALIGN(16) extern const unsigned char _IndexesFromZero16[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; // suffix is the element count, as for the other SSE index tables
 437
 438     V_ALIGN(64) const unsigned long long c_log<double>::data[21 * 2] = { // 21 logarithm constants as 64-bit patterns, each stored twice; the /*n*/ markers give the logical index
 439       /* 0*/   0x000003ff000003ffull, 0x000003ff000003ffull // bias TODO: remove
 440       /* 1*/ , 0x7ff0000000000000ull, 0x7ff0000000000000ull // exponentMask (+inf)
 441
 442       /* 2*/ , 0x3f1ab4c293c31bb0ull, 0x3f1ab4c293c31bb0ull // P[0]
 443       /* 3*/ , 0x3fdfd6f53f5652f2ull, 0x3fdfd6f53f5652f2ull // P[1]
 444       /* 4*/ , 0x4012d2baed926911ull, 0x4012d2baed926911ull // P[2]
 445       /* 5*/ , 0x402cff72c63eeb2eull, 0x402cff72c63eeb2eull // P[3]
 446       /* 6*/ , 0x4031efd6924bc84dull, 0x4031efd6924bc84dull // P[4]
 447       /* 7*/ , 0x401ed5637d7edcf8ull, 0x401ed5637d7edcf8ull // P[5]
 448
 449       /* 8*/ , 0x40269320ae97ef8eull, 0x40269320ae97ef8eull // Q[0]
 450       /* 9*/ , 0x40469d2c4e19c033ull, 0x40469d2c4e19c033ull // Q[1]
 451       /*10*/ , 0x4054bf33a326bdbdull, 0x4054bf33a326bdbdull // Q[2]
 452       /*11*/ , 0x4051c9e2eb5eae21ull, 0x4051c9e2eb5eae21ull // Q[3]
 453       /*12*/ , 0x4037200a9e1f25b2ull, 0x4037200a9e1f25b2ull // Q[4]
 454
 455       /*13*/ , 0xfff0000000000000ull, 0xfff0000000000000ull // -inf
 456       /*14*/ , 0x0010000000000000ull, 0x0010000000000000ull // min()
 457       /*15*/ , 0x3fe6a09e667f3bcdull, 0x3fe6a09e667f3bcdull // 1/sqrt(2)
 458       /*16*/ , 0x3fe6300000000000ull, 0x3fe6300000000000ull // round(ln(2) * 512) / 512
 459       /*17*/ , 0xbf2bd0105c610ca8ull, 0xbf2bd0105c610ca8ull // ln(2) - round(ln(2) * 512) / 512
 460       /*18*/ , 0x3fe0000000000000ull, 0x3fe0000000000000ull // 0.5
 461       /*19*/ , 0x3fdbcb7b1526e50eull, 0x3fdbcb7b1526e50eull // log10(e)
 462       /*20*/ , 0x3ff71547652b82feull, 0x3ff71547652b82feull // log2(e)
 463     };
 464
 465     template<> V_ALIGN(64) const unsigned int c_log<float>::data[21 * 4] = { // 21 single-precision logarithm constants as 32-bit patterns, each stored four times
 466         0x0000007fu, 0x0000007fu, 0x0000007fu, 0x0000007fu, // bias TODO: remove
 467         0x7f800000u, 0x7f800000u, 0x7f800000u, 0x7f800000u, // exponentMask (+inf)
 468
 469         0x3d9021bbu, 0x3d9021bbu, 0x3d9021bbu, 0x3d9021bbu, //  7.0376836292e-2f // P[0]
 470         0xbdebd1b8u, 0xbdebd1b8u, 0xbdebd1b8u, 0xbdebd1b8u, // -1.1514610310e-1f // P[1]
 471         0x3def251au, 0x3def251au, 0x3def251au, 0x3def251au, //  1.1676998740e-1f // P[2]
 472         0xbdfe5d4fu, 0xbdfe5d4fu, 0xbdfe5d4fu, 0xbdfe5d4fu, // -1.2420140846e-1f // P[3]
 473         0x3e11e9bfu, 0x3e11e9bfu, 0x3e11e9bfu, 0x3e11e9bfu, //  1.4249322787e-1f // P[4]
 474         0xbe2aae50u, 0xbe2aae50u, 0xbe2aae50u, 0xbe2aae50u, // -1.6668057665e-1f // P[5]
 475         0x3e4cceacu, 0x3e4cceacu, 0x3e4cceacu, 0x3e4cceacu, //  2.0000714765e-1f // P[6]
 476         0xbe7ffffcu, 0xbe7ffffcu, 0xbe7ffffcu, 0xbe7ffffcu, // -2.4999993993e-1f // P[7]
 477         0x3eaaaaaau, 0x3eaaaaaau, 0x3eaaaaaau, 0x3eaaaaaau, //  3.3333331174e-1f // P[8]
 478         0,           0,           0,           0,           // padding because of c_log<double>
 479         0,           0,           0,           0,           // padding because of c_log<double>
 480
 481         0xff800000u, 0xff800000u, 0xff800000u, 0xff800000u, // -inf
 482         0x00800000u, 0x00800000u, 0x00800000u, 0x00800000u, // min()
 483         0x3f3504f3u, 0x3f3504f3u, 0x3f3504f3u, 0x3f3504f3u, // 1/sqrt(2)
 484         // Derivation of the two-part ln(2) split used by the next two rows:
 485         // ln(2) = 0x3fe62e42fefa39ef
 486         // ln(2) = Vc_buildDouble( 1, 0x00062e42fefa39ef, -1)
 487         //       = Vc_buildFloat( 1, 0x00317217(f7d), -1) + Vc_buildFloat( 1, 0x0077d1cd, -25)
 488         0x3f318000u, 0x3f318000u, 0x3f318000u, 0x3f318000u, // round(ln(2) * 512) / 512
 489         0xb95e8083u, 0xb95e8083u, 0xb95e8083u, 0xb95e8083u, // ln(2) - round(ln(2) * 512) / 512
 490         0x3f000000u, 0x3f000000u, 0x3f000000u, 0x3f000000u, // 0.5
 491         0x3ede5bd9u, 0x3ede5bd9u, 0x3ede5bd9u, 0x3ede5bd9u, // log10(e)
 492         0x3fb8aa3bu, 0x3fb8aa3bu, 0x3fb8aa3bu, 0x3fb8aa3bu, // log2(e)
 493         // log10(2) = 0x3fd34413509f79ff
 494         //          = Vc_buildDouble( 1, 0x00034413509f79ff, -2)
 495         //          = Vc_buildFloat( 1, 0x001a209a(84fbcff8), -2) + Vc_buildFloat( 1, 0x0004fbcff(8), -26)
 496         //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2)
 497         //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2)
 498         //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2)
 499         //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2)
 500     };
 501 } // namespace SSE
 502
 503 V_ALIGN(64) unsigned int RandomState[16] = { // mutable (non-const) 16-word table; NOTE(review): presumably the initial state of the library's random number generator - confirm against its users
 504     0x5a383a4fu, 0xc68bd45eu, 0x691d6d86u, 0xb367e14fu,
 505     0xd689dbaau, 0xfde442aau, 0x3d265423u, 0x1a77885cu,
 506     0x36ed2684u, 0xfb1f049du, 0x19e52f31u, 0x821e4dd7u,
 507     0x23996d25u, 0x5962725au, 0x6aced4ceu, 0xd4c610f3u
 508 };
 509
 510 // dummy symbol to emit warnings with GCC 4.3; the function body is intentionally empty
 511 namespace Warnings {
 512     void _operator_bracket_warning() {} // defined here so the symbol exists in the compiled library
 513 } // namespace Warnings
 514
 515 const char LIBRARY_VERSION[] = VC_VERSION_STRING; // version string of the headers this file was compiled against (macro from Vc/version.h)
 516 const unsigned int LIBRARY_VERSION_NUMBER = VC_VERSION_NUMBER; // numeric version, compared with `<` in checkLibraryAbi
 517 const unsigned int LIBRARY_ABI_VERSION = VC_LIBRARY_ABI_VERSION; // ABI version, must match exactly in checkLibraryAbi
 518
 519 void checkLibraryAbi(unsigned int compileTimeAbi, unsigned int versionNumber, const char *compileTimeVersion) { // aborts the process unless the compiled library matches the caller's Vc headers
 520     if (LIBRARY_ABI_VERSION == compileTimeAbi && LIBRARY_VERSION_NUMBER >= versionNumber) return; // compatible: identical ABI version and a library that is not older than versionNumber
 521     printf("The versions of libVc.a (%s) and Vc/version.h (%s) are incompatible. Aborting.\n", LIBRARY_VERSION, compileTimeVersion); // compileTimeVersion is only used for this message
 522     fflush(stdout); // abort() is not required to flush stdio buffers, so push the message out first (it was lost when stdout was a pipe or file)
 523     abort();
 524 }
 525
 526 } // namespace Vc
 527 } // namespace AliRoot
 528
 529 #undef V_ALIGN