1 /* This file is part of the Vc library.
3 Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>
5 Vc is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation, either version 3 of
8 the License, or (at your option) any later version.
10 Vc is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with Vc. If not, see <http://www.gnu.org/licenses/>.
26 #ifdef __INTEL_COMPILER
27 #define VC_ICC __INTEL_COMPILER_BUILD_DATE
28 #elif defined(__OPENCC__)
30 #elif defined(__clang__)
31 #define VC_CLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
32 #elif defined(__GNUC__)
33 #define VC_GCC (__GNUC__ * 0x10000 + __GNUC_MINOR__ * 0x100 + __GNUC_PATCHLEVEL__)
34 #elif defined(_MSC_VER)
35 #define VC_MSVC _MSC_FULL_VER
37 #define VC_UNSUPPORTED_COMPILER 1
40 // Features/Quirks defines
41 #if defined VC_MSVC && defined _WIN32
42 // the Win32 ABI can't handle function parameters with alignment >= 16
43 #define VC_PASSING_VECTOR_BY_VALUE_IS_BROKEN 1
45 #if defined(__GNUC__) && !defined(VC_NO_INLINE_ASM)
48 #if defined(VC_GCC) && (VC_GCC <= 0x40405 || (VC_GCC >= 0x40500 && VC_GCC <= 0x40502)) && !(VC_GCC == 0x40502 && defined(__GNUC_UBUNTU_VERSION__) && __GNUC_UBUNTU_VERSION__ == 0xb0408)
49 // GCC 4.6.0 / 4.5.3 / 4.4.6 switched to the interface as defined by ICC
50 // (Ubuntu 11.04 ships a GCC 4.5.2 with the new interface)
51 #define VC_MM256_MASKSTORE_WRONG_MASK_TYPE 1
53 #if defined(VC_GCC) && VC_GCC >= 0x40300
54 #define VC_HAVE_ATTRIBUTE_ERROR 1
55 #define VC_HAVE_ATTRIBUTE_WARNING 1
58 #if (defined(__GXX_EXPERIMENTAL_CXX0X__) && VC_GCC >= 0x40600) || __cplusplus >= 201103
61 # if VC_GCC >= 0x40700 // && VC_GCC < 0x408000)
62 // ::max_align_t was introduced with GCC 4.7. std::max_align_t took a bit longer.
63 # define VC_HAVE_MAX_ALIGN_T 1
65 # elif !defined(VC_CLANG)
66 // Clang doesn't provide max_align_t at all
67 # define VC_HAVE_STD_MAX_ALIGN_T 1
71 // ICC ships the AVX2 intrinsics inside the AVX1 header.
72 // FIXME: the number 20120731 is too large, but I don't know which one is the right one
73 #if (defined(VC_ICC) && VC_ICC >= 20120731) || (defined(VC_MSVC) && VC_MSVC >= 170000000)
74 #define VC_UNCONDITIONAL_AVX2_INTRINSICS 1
77 /* Define the following strings to a unique integer, which is the only type the preprocessor can
 * compare. This allows using -DVC_IMPL=SSE3. The preprocessor will then consider VC_IMPL and SSE3
79 * to be equal. Of course, it is important to undefine the strings later on!
// Temporary token-to-integer mapping so the preprocessor can compare a
// user-supplied -DVC_IMPL=<name> against these names (see the block comment
// above). These macros MUST be #undef'ed again after VC_IMPL is evaluated.
// Implementation selectors: exactly one of these occupies the upper 12 bits
// of VC_IMPL (see IMPL_MASK below); a higher value denotes a superset ISA.
#define Scalar 0x00100000
#define SSE 0x00200000
#define SSE2 0x00300000
#define SSE3 0x00400000
#define SSSE3 0x00500000
#define SSE4_1 0x00600000
#define SSE4_2 0x00700000
#define AVX 0x00800000
// Extra-instruction flags: independent single-bit options that can be OR'ed
// into the lower 20 bits of VC_IMPL alongside the selector above.
#define XOP 0x00000001
#define FMA4 0x00000002
#define F16C 0x00000004
#define POPCNT 0x00000008
#define SSE4a 0x00000010
#define FMA 0x00000020
// Masks separating the two fields of VC_IMPL: selector (upper 12 bits) and
// extra-instruction bits (lower 20 bits).
#define IMPL_MASK 0xFFF00000
#define EXT_MASK 0x000FFFFF
112 # elif defined(_M_AMD64)
113 // If the target is x86_64 then SSE2 is guaranteed
125 # if defined(__AVX__)
126 # define VC_IMPL_AVX 1
128 # if defined(__SSE4_2__)
129 # define VC_IMPL_SSE 1
130 # define VC_IMPL_SSE4_2 1
132 # if defined(__SSE4_1__)
133 # define VC_IMPL_SSE 1
134 # define VC_IMPL_SSE4_1 1
136 # if defined(__SSE3__)
137 # define VC_IMPL_SSE 1
138 # define VC_IMPL_SSE3 1
140 # if defined(__SSSE3__)
141 # define VC_IMPL_SSE 1
142 # define VC_IMPL_SSSE3 1
144 # if defined(__SSE2__)
145 # define VC_IMPL_SSE 1
146 # define VC_IMPL_SSE2 1
149 # if defined(VC_IMPL_SSE)
152 # define VC_IMPL_Scalar 1
155 # if defined(VC_IMPL_AVX) || defined(VC_IMPL_SSE)
157 # define VC_IMPL_FMA4 1
160 # define VC_IMPL_XOP 1
163 # define VC_IMPL_F16C 1
166 # define VC_IMPL_POPCNT 1
169 # define VC_IMPL_SSE4a 1
172 # define VC_IMPL_FMA 1
178 # if (VC_IMPL & IMPL_MASK) == AVX // AVX supersedes SSE
179 # define VC_IMPL_AVX 1
180 # elif (VC_IMPL & IMPL_MASK) == Scalar
181 # define VC_IMPL_Scalar 1
182 # elif (VC_IMPL & IMPL_MASK) == SSE4_2
183 # define VC_IMPL_SSE4_2 1
184 # define VC_IMPL_SSE4_1 1
185 # define VC_IMPL_SSSE3 1
186 # define VC_IMPL_SSE3 1
187 # define VC_IMPL_SSE2 1
188 # define VC_IMPL_SSE 1
189 # elif (VC_IMPL & IMPL_MASK) == SSE4_1
190 # define VC_IMPL_SSE4_1 1
191 # define VC_IMPL_SSSE3 1
192 # define VC_IMPL_SSE3 1
193 # define VC_IMPL_SSE2 1
194 # define VC_IMPL_SSE 1
195 # elif (VC_IMPL & IMPL_MASK) == SSSE3
196 # define VC_IMPL_SSSE3 1
197 # define VC_IMPL_SSE3 1
198 # define VC_IMPL_SSE2 1
199 # define VC_IMPL_SSE 1
200 # elif (VC_IMPL & IMPL_MASK) == SSE3
201 # define VC_IMPL_SSE3 1
202 # define VC_IMPL_SSE2 1
203 # define VC_IMPL_SSE 1
204 # elif (VC_IMPL & IMPL_MASK) == SSE2
205 # define VC_IMPL_SSE2 1
206 # define VC_IMPL_SSE 1
207 # elif (VC_IMPL & IMPL_MASK) == SSE
208 # define VC_IMPL_SSE 1
209 # if defined(__SSE4_2__)
210 # define VC_IMPL_SSE4_2 1
212 # if defined(__SSE4_1__)
213 # define VC_IMPL_SSE4_1 1
215 # if defined(__SSE3__)
216 # define VC_IMPL_SSE3 1
218 # if defined(__SSSE3__)
219 # define VC_IMPL_SSSE3 1
221 # if defined(__SSE2__)
222 # define VC_IMPL_SSE2 1
224 # elif (VC_IMPL & IMPL_MASK) == 0 && (VC_IMPL & SSE4a)
225 // this is for backward compatibility only where SSE4a was included in the main
226 // line of available SIMD instruction sets
227 # define VC_IMPL_SSE3 1
228 # define VC_IMPL_SSE2 1
229 # define VC_IMPL_SSE 1
232 # define VC_IMPL_XOP 1
234 # if (VC_IMPL & FMA4)
235 # define VC_IMPL_FMA4 1
237 # if (VC_IMPL & F16C)
238 # define VC_IMPL_F16C 1
240 # if (VC_IMPL & POPCNT)
241 # define VC_IMPL_POPCNT 1
243 # if (VC_IMPL & SSE4a)
244 # define VC_IMPL_SSE4a 1
247 # define VC_IMPL_FMA 1
253 // If AVX is enabled in the compiler it will use VEX coding for the SIMD instructions.
255 # define VC_USE_VEX_CODING 1
258 #if defined(VC_GCC) && VC_GCC < 0x40300 && !defined(VC_IMPL_Scalar)
259 # ifndef VC_DONT_WARN_OLD_GCC
260 # warning "GCC < 4.3 does not have full support for SSE2 intrinsics. Using scalar types/operations only. Define VC_DONT_WARN_OLD_GCC to silence this warning."
265 # undef VC_IMPL_SSE4_1
266 # undef VC_IMPL_SSE4_2
267 # undef VC_IMPL_SSSE3
272 # undef VC_IMPL_POPCNT
273 # undef VC_IMPL_SSE4a
275 # undef VC_USE_VEX_CODING
276 # define VC_IMPL_Scalar 1
279 # if !defined(VC_IMPL_Scalar) && !defined(VC_IMPL_SSE) && !defined(VC_IMPL_AVX)
280 # error "No suitable Vc implementation was selected! Probably VC_IMPL was set to an invalid value."
281 # elif defined(VC_IMPL_SSE) && !defined(VC_IMPL_SSE2)
282 # error "SSE requested but no SSE2 support. Vc needs at least SSE2!"
312 enum StreamingAndAlignedFlag { // implies Aligned
315 enum StreamingAndUnalignedFlag {
316 StreamingAndUnaligned = 3
323 * Enum that specifies the alignment and padding restrictions to use for memory allocation with
326 enum MallocAlignment {
328 * Align on boundary of vector sizes (e.g. 16 Bytes on SSE platforms) and pad to allow
329 * vector access to the end. Thus the allocated memory contains a multiple of
330 * VectorAlignment bytes.
334 * Align on boundary of cache line sizes (e.g. 64 Bytes on x86) and pad to allow
335 * full cache line access to the end. Thus the allocated memory contains a multiple of
340 * Align on boundary of page sizes (e.g. 4096 Bytes on x86) and pad to allow
341 * full page access to the end. Thus the allocated memory contains a multiple of
347 #if __cplusplus >= 201103 /*C++11*/
348 #define Vc_CONSTEXPR static constexpr
349 #elif defined(__GNUC__)
350 #define Vc_CONSTEXPR static inline __attribute__((__always_inline__, __const__))
351 #elif defined(VC_MSVC)
352 #define Vc_CONSTEXPR static inline __forceinline
354 #define Vc_CONSTEXPR static inline
// Combining load/store flag objects. Streaming implies Aligned (see the enum
// comments above), so:
//  - Streaming combined with an explicit Unaligned flag yields
//    StreamingAndUnaligned;
//  - Streaming combined with Aligned stays plain Streaming.
// Both operator| and operator& are overloaded to the same result so either
// spelling works at call sites; with C++11 these are constexpr (Vc_CONSTEXPR),
// otherwise force-inlined const functions that fold to a constant.
Vc_CONSTEXPR StreamingAndUnalignedFlag operator|(UnalignedFlag, StreamingAndAlignedFlag) { return StreamingAndUnaligned; }
Vc_CONSTEXPR StreamingAndUnalignedFlag operator|(StreamingAndAlignedFlag, UnalignedFlag) { return StreamingAndUnaligned; }
Vc_CONSTEXPR StreamingAndUnalignedFlag operator&(UnalignedFlag, StreamingAndAlignedFlag) { return StreamingAndUnaligned; }
Vc_CONSTEXPR StreamingAndUnalignedFlag operator&(StreamingAndAlignedFlag, UnalignedFlag) { return StreamingAndUnaligned; }
Vc_CONSTEXPR StreamingAndAlignedFlag operator|(AlignedFlag, StreamingAndAlignedFlag) { return Streaming; }
Vc_CONSTEXPR StreamingAndAlignedFlag operator|(StreamingAndAlignedFlag, AlignedFlag) { return Streaming; }
Vc_CONSTEXPR StreamingAndAlignedFlag operator&(AlignedFlag, StreamingAndAlignedFlag) { return Streaming; }
Vc_CONSTEXPR StreamingAndAlignedFlag operator&(StreamingAndAlignedFlag, AlignedFlag) { return Streaming; }
369 * Enum to identify a certain SIMD instruction set.
371 * You can use \ref VC_IMPL for the currently active implementation.
373 * \see ExtraInstructions
375 enum Implementation {
376 /// uses only fundamental types
380 /// x86 SSE + SSE2 + SSE3
382 /// x86 SSE + SSE2 + SSE3 + SSSE3
384 /// x86 SSE + SSE2 + SSE3 + SSSE3 + SSE4.1
386 /// x86 SSE + SSE2 + SSE3 + SSSE3 + SSE4.1 + SSE4.2
392 ImplementationMask = 0xfff
398 * The list of available instructions is not easily described by a linear list of instruction sets.
399 * On x86 the following instruction sets always include their predecessors:
400 * SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2
402 * But there are additional instructions that are not necessarily required by this list. These are
403 * covered in this enum.
405 enum ExtraInstructions {
406 //! Support for float16 conversions in hardware
407 Float16cInstructions = 0x01000,
408 //! Support for FMA4 instructions
409 Fma4Instructions = 0x02000,
410 //! Support for XOP instructions
411 XopInstructions = 0x04000,
412 //! Support for the population count instruction
413 PopcntInstructions = 0x08000,
414 //! Support for SSE4a instructions
415 Sse4aInstructions = 0x10000,
416 //! Support for FMA instructions (3 operand variant)
417 FmaInstructions = 0x20000,
418 // PclmulqdqInstructions,
420 // RdrandInstructions
421 ExtraInstructionsMask = 0xfffff000u
426 #ifdef VC_IMPL_Scalar
427 #define VC_IMPL ::AliRoot::Vc::ScalarImpl
428 #elif defined(VC_IMPL_AVX)
429 #define VC_IMPL ::AliRoot::Vc::AVXImpl
430 #elif defined(VC_IMPL_SSE4_2)
431 #define VC_IMPL ::AliRoot::Vc::SSE42Impl
432 #elif defined(VC_IMPL_SSE4_1)
433 #define VC_IMPL ::AliRoot::Vc::SSE41Impl
434 #elif defined(VC_IMPL_SSSE3)
435 #define VC_IMPL ::AliRoot::Vc::SSSE3Impl
436 #elif defined(VC_IMPL_SSE3)
437 #define VC_IMPL ::AliRoot::Vc::SSE3Impl
438 #elif defined(VC_IMPL_SSE2)
439 #define VC_IMPL ::AliRoot::Vc::SSE2Impl
442 template<unsigned int Features> struct ImplementationT { enum _Value {
444 Implementation = Features & Vc::ImplementationMask,
445 ExtraInstructions = Features & Vc::ExtraInstructionsMask
448 typedef ImplementationT<
449 #ifdef VC_USE_VEX_CODING
450 // everything will use VEX coding, so the system has to support AVX even if VC_IMPL_AVX is not set
// but AFAIU the OSXSAVE and xgetbv tests do not have to be positive (unless, of course, the
452 // compiler decides to insert an instruction that uses the full register size - so better be on
459 + Vc::Sse4aInstructions
461 + Vc::XopInstructions
463 + Vc::Fma4Instructions
467 #ifdef VC_IMPL_POPCNT
468 + Vc::PopcntInstructions
471 + Vc::FmaInstructions
473 > CurrentImplementation;
// Per-ISA helper: specialized elsewhere for each Implementation enumerator.
template<Implementation Impl> struct HelperImpl;
// Helper is the HelperImpl specialization matching the implementation this
// translation unit is compiled for (VC_IMPL expands to e.g.
// ::AliRoot::Vc::SSE2Impl, selected by the #ifdef chain above).
typedef HelperImpl<VC_IMPL> Helper;
// FlagObject maps a flag *type* back to its canonical enumerator value, so
// templated code that only deduced the flag type can materialize a flag object
// via FlagObject<A>::the().
template<typename A> struct FlagObject;
template<> struct FlagObject<AlignedFlag> { Vc_CONSTEXPR AlignedFlag the() { return Aligned; } };
template<> struct FlagObject<UnalignedFlag> { Vc_CONSTEXPR UnalignedFlag the() { return Unaligned; } };
template<> struct FlagObject<StreamingAndAlignedFlag> { Vc_CONSTEXPR StreamingAndAlignedFlag the() { return Streaming; } };
template<> struct FlagObject<StreamingAndUnalignedFlag> { Vc_CONSTEXPR StreamingAndUnalignedFlag the() { return StreamingAndUnaligned; } };
484 } // namespace Internal
488 void _operator_bracket_warning()
489 #ifdef VC_HAVE_ATTRIBUTE_WARNING
490 __attribute__((warning("\n\tUse of Vc::Vector::operator[] to modify scalar entries is known to miscompile with GCC 4.3.x.\n\tPlease upgrade to a more recent GCC or avoid operator[] altogether.\n\t(This warning adds an unnecessary function call to operator[] which should work around the problem at a little extra cost.)")))
493 } // namespace Warnings
497 template<typename L, typename R> struct invalid_operands_of_types {};
502 } // namespace AliRoot
507 #endif // VC_GLOBAL_H