- support compilation with GCC 4.1 and 4.2
author sgorbuno <sgorbuno@f7af4fe6-9843-0410-8265-dc069ae4e863>
Tue, 17 Jul 2012 10:31:43 +0000 (10:31 +0000)
committer sgorbuno <sgorbuno@f7af4fe6-9843-0410-8265-dc069ae4e863>
Tue, 17 Jul 2012 10:31:43 +0000 (10:31 +0000)
- add TARGET_ARCHITECTURE=none, a setting for which no architecture-specific compiler flags are added
- change TARGET_ARCHITECTURE default to none for AliRoot

Vc/cmake/AddCompilerFlag.cmake
Vc/cmake/OptimizeForArchitecture.cmake
Vc/cmake/VcMacros.cmake
Vc/include/Vc/IO
Vc/include/Vc/common/macros.h
Vc/include/Vc/cpuid.h
Vc/src/cpuid.cpp

index 30eb7e8..75ad694 100644 (file)
@@ -2,7 +2,7 @@ get_filename_component(_currentDir "${CMAKE_CURRENT_LIST_FILE}" PATH)
 include("${_currentDir}/CheckCCompilerFlag.cmake")
 include("${_currentDir}/CheckCXXCompilerFlag.cmake")
 macro(AddCompilerFlag _flag)
-   string(REGEX REPLACE "[-+/:= ]" "_" _flag_esc "${_flag}")
+   string(REGEX REPLACE "[-.+/:= ]" "_" _flag_esc "${_flag}")
    check_c_compiler_flag("${_flag}" check_c_compiler_flag_${_flag_esc})
    check_cxx_compiler_flag("${_flag}" check_cxx_compiler_flag_${_flag_esc})
 
index 01f435c..8194283 100644 (file)
@@ -97,7 +97,7 @@ macro(AutodetectHostArchitecture)
 endmacro()
 
 macro(OptimizeForArchitecture)
-   set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used.\nSetting the value to \"auto\" will try to optimize for the architecture where cmake is called.\nOther supported values are: \"generic\", \"core\", \"merom\" (65nm Core2), \"penryn\" (45nm Core2), \"nehalem\", \"westmere\", \"sandy-bridge\", \"atom\", \"k8\", \"k8-sse3\", \"barcelona\", \"istanbul\", \"magny-cours\", \"bulldozer\", \"interlagos\".")
+   set(TARGET_ARCHITECTURE "none" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used.\nSetting the value to \"auto\" will try to optimize for the architecture where cmake is called.\nOther supported values are: \"none\", \"generic\", \"core\", \"merom\" (65nm Core2), \"penryn\" (45nm Core2), \"nehalem\", \"westmere\", \"sandy-bridge\", \"atom\", \"k8\", \"k8-sse3\", \"barcelona\", \"istanbul\", \"magny-cours\", \"bulldozer\", \"interlagos\".")
    set(_force)
    if(NOT _last_target_arch STREQUAL "${TARGET_ARCHITECTURE}")
       message(STATUS "target changed from \"${_last_target_arch}\" to \"${TARGET_ARCHITECTURE}\"")
@@ -183,157 +183,166 @@ macro(OptimizeForArchitecture)
       list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a")
    elseif(TARGET_ARCHITECTURE STREQUAL "generic")
       list(APPEND _march_flag_list "generic")
+   elseif(TARGET_ARCHITECTURE STREQUAL "none")
+      # add this clause to remove it from the else clause
    else(TARGET_ARCHITECTURE STREQUAL "core")
       message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.")
    endif(TARGET_ARCHITECTURE STREQUAL "core")
 
-   set(_disable_vector_unit_list)
-   set(_enable_vector_unit_list)
-   _my_find(_available_vector_units_list "sse2" SSE2_FOUND)
-   _my_find(_available_vector_units_list "sse3" SSE3_FOUND)
-   _my_find(_available_vector_units_list "ssse3" SSSE3_FOUND)
-   _my_find(_available_vector_units_list "sse4.1" SSE4_1_FOUND)
-   _my_find(_available_vector_units_list "sse4.2" SSE4_2_FOUND)
-   _my_find(_available_vector_units_list "sse4a" SSE4a_FOUND)
-   if(DEFINED Vc_AVX_INTRINSICS_BROKEN AND Vc_AVX_INTRINSICS_BROKEN)
-      UserWarning("AVX disabled per default because of old/broken compiler")
-      set(AVX_FOUND false)
-      set(XOP_FOUND false)
-      set(FMA4_FOUND false)
-   else()
-      _my_find(_available_vector_units_list "avx" AVX_FOUND)
-      _my_find(_available_vector_units_list "xop" XOP_FOUND)
-      _my_find(_available_vector_units_list "fma4" FMA4_FOUND)
-   endif()
-   set(USE_SSE2   ${SSE2_FOUND}   CACHE BOOL "Use SSE2. If SSE2 instructions are not enabled the SSE implementation will be disabled." ${_force})
-   set(USE_SSE3   ${SSE3_FOUND}   CACHE BOOL "Use SSE3. If SSE3 instructions are not enabled they will be emulated." ${_force})
-   set(USE_SSSE3  ${SSSE3_FOUND}  CACHE BOOL "Use SSSE3. If SSSE3 instructions are not enabled they will be emulated." ${_force})
-   set(USE_SSE4_1 ${SSE4_1_FOUND} CACHE BOOL "Use SSE4.1. If SSE4.1 instructions are not enabled they will be emulated." ${_force})
-   set(USE_SSE4_2 ${SSE4_2_FOUND} CACHE BOOL "Use SSE4.2. If SSE4.2 instructions are not enabled they will be emulated." ${_force})
-   set(USE_SSE4a  ${SSE4a_FOUND}  CACHE BOOL "Use SSE4a. If SSE4a instructions are not enabled they will be emulated." ${_force})
-   set(USE_AVX    ${AVX_FOUND}    CACHE BOOL "Use AVX. This will double some of the vector sizes relative to SSE." ${_force})
-   set(USE_XOP    ${XOP_FOUND}    CACHE BOOL "Use XOP." ${_force})
-   set(USE_FMA4   ${FMA4_FOUND}   CACHE BOOL "Use FMA4." ${_force})
-   mark_as_advanced(USE_SSE2 USE_SSE3 USE_SSSE3 USE_SSE4_1 USE_SSE4_2 USE_SSE4a USE_AVX USE_XOP USE_FMA4)
-   if(USE_SSE2)
-      list(APPEND _enable_vector_unit_list "sse2")
-   else(USE_SSE2)
-      list(APPEND _disable_vector_unit_list "sse2")
-   endif(USE_SSE2)
-   if(USE_SSE3)
-      list(APPEND _enable_vector_unit_list "sse3")
-   else(USE_SSE3)
-      list(APPEND _disable_vector_unit_list "sse3")
-   endif(USE_SSE3)
-   if(USE_SSSE3)
-      list(APPEND _enable_vector_unit_list "ssse3")
-   else(USE_SSSE3)
-      list(APPEND _disable_vector_unit_list "ssse3")
-   endif(USE_SSSE3)
-   if(USE_SSE4_1)
-      list(APPEND _enable_vector_unit_list "sse4.1")
-   else(USE_SSE4_1)
-      list(APPEND _disable_vector_unit_list "sse4.1")
-   endif(USE_SSE4_1)
-   if(USE_SSE4_2)
-      list(APPEND _enable_vector_unit_list "sse4.2")
-   else(USE_SSE4_2)
-      list(APPEND _disable_vector_unit_list "sse4.2")
-   endif(USE_SSE4_2)
-   if(USE_SSE4a)
-      list(APPEND _enable_vector_unit_list "sse4a")
-   else(USE_SSE4a)
-      list(APPEND _disable_vector_unit_list "sse4a")
-   endif(USE_SSE4a)
-   if(USE_AVX)
-      list(APPEND _enable_vector_unit_list "avx")
-      # we want SSE intrinsics to result in instructions using the VEX prefix.
-      # Otherwise integer ops (which require the older SSE intrinsics) would
-      # always have a large penalty.
-      list(APPEND _enable_vector_unit_list "sse2avx")
-   else(USE_AVX)
-      list(APPEND _disable_vector_unit_list "avx")
-   endif(USE_AVX)
-   if(USE_XOP)
-      list(APPEND _enable_vector_unit_list "xop")
-   else()
-      list(APPEND _disable_vector_unit_list "xop")
-   endif()
-   if(USE_FMA4)
-      list(APPEND _enable_vector_unit_list "fma4")
-   else()
-      list(APPEND _disable_vector_unit_list "fma4")
-   endif()
-   if(MSVC)
-      # MSVC on 32 bit can select /arch:SSE2 (since 2010 also /arch:AVX)
-      # MSVC on 64 bit cannot select anything (should have changed with MSVC 2010)
-      _my_find(_enable_vector_unit_list "avx" _avx)
-      set(_avx_flag FALSE)
-      if(_avx)
-         AddCompilerFlag("/arch:AVX" CXX_FLAGS Vc_ARCHITECTURE_FLAGS CXX_RESULT _avx_flag)
-      endif()
-      if(NOT _avx_flag)
-         _my_find(_enable_vector_unit_list "sse2" _found)
-         if(_found)
-            AddCompilerFlag("/arch:SSE2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
+   if(NOT TARGET_ARCHITECTURE STREQUAL "none")
+      set(_disable_vector_unit_list)
+      set(_enable_vector_unit_list)
+      _my_find(_available_vector_units_list "sse2" SSE2_FOUND)
+      _my_find(_available_vector_units_list "sse3" SSE3_FOUND)
+      _my_find(_available_vector_units_list "ssse3" SSSE3_FOUND)
+      _my_find(_available_vector_units_list "sse4.1" SSE4_1_FOUND)
+      _my_find(_available_vector_units_list "sse4.2" SSE4_2_FOUND)
+      _my_find(_available_vector_units_list "sse4a" SSE4a_FOUND)
+      if(DEFINED Vc_AVX_INTRINSICS_BROKEN AND Vc_AVX_INTRINSICS_BROKEN)
+         UserWarning("AVX disabled per default because of old/broken compiler")
+         set(AVX_FOUND false)
+         set(XOP_FOUND false)
+         set(FMA4_FOUND false)
+      else()
+         _my_find(_available_vector_units_list "avx" AVX_FOUND)
+         _my_find(_available_vector_units_list "fma4" FMA4_FOUND)
+         if(DEFINED Vc_XOP_INTRINSICS_BROKEN AND Vc_XOP_INTRINSICS_BROKEN)
+            UserWarning("XOP disabled per default because of old/broken compiler")
+            set(XOP_FOUND false)
+         else()
+            _my_find(_available_vector_units_list "xop" XOP_FOUND)
          endif()
       endif()
-      foreach(_flag ${_enable_vector_unit_list})
-         string(TOUPPER "${_flag}" _flag)
-         string(REPLACE "." "_" _flag "__${_flag}__")
-         add_definitions("-D${_flag}")
-      endforeach(_flag)
-   elseif(CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") # ICC (on Linux)
-      _my_find(_available_vector_units_list "avx"    _found)
-      if(_found)
-         AddCompilerFlag("-xAVX" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
-      else(_found)
-         _my_find(_available_vector_units_list "sse4.2" _found)
+      set(USE_SSE2   ${SSE2_FOUND}   CACHE BOOL "Use SSE2. If SSE2 instructions are not enabled the SSE implementation will be disabled." ${_force})
+      set(USE_SSE3   ${SSE3_FOUND}   CACHE BOOL "Use SSE3. If SSE3 instructions are not enabled they will be emulated." ${_force})
+      set(USE_SSSE3  ${SSSE3_FOUND}  CACHE BOOL "Use SSSE3. If SSSE3 instructions are not enabled they will be emulated." ${_force})
+      set(USE_SSE4_1 ${SSE4_1_FOUND} CACHE BOOL "Use SSE4.1. If SSE4.1 instructions are not enabled they will be emulated." ${_force})
+      set(USE_SSE4_2 ${SSE4_2_FOUND} CACHE BOOL "Use SSE4.2. If SSE4.2 instructions are not enabled they will be emulated." ${_force})
+      set(USE_SSE4a  ${SSE4a_FOUND}  CACHE BOOL "Use SSE4a. If SSE4a instructions are not enabled they will be emulated." ${_force})
+      set(USE_AVX    ${AVX_FOUND}    CACHE BOOL "Use AVX. This will double some of the vector sizes relative to SSE." ${_force})
+      set(USE_XOP    ${XOP_FOUND}    CACHE BOOL "Use XOP." ${_force})
+      set(USE_FMA4   ${FMA4_FOUND}   CACHE BOOL "Use FMA4." ${_force})
+      mark_as_advanced(USE_SSE2 USE_SSE3 USE_SSSE3 USE_SSE4_1 USE_SSE4_2 USE_SSE4a USE_AVX USE_XOP USE_FMA4)
+      if(USE_SSE2)
+         list(APPEND _enable_vector_unit_list "sse2")
+      else(USE_SSE2)
+         list(APPEND _disable_vector_unit_list "sse2")
+      endif(USE_SSE2)
+      if(USE_SSE3)
+         list(APPEND _enable_vector_unit_list "sse3")
+      else(USE_SSE3)
+         list(APPEND _disable_vector_unit_list "sse3")
+      endif(USE_SSE3)
+      if(USE_SSSE3)
+         list(APPEND _enable_vector_unit_list "ssse3")
+      else(USE_SSSE3)
+         list(APPEND _disable_vector_unit_list "ssse3")
+      endif(USE_SSSE3)
+      if(USE_SSE4_1)
+         list(APPEND _enable_vector_unit_list "sse4.1")
+      else(USE_SSE4_1)
+         list(APPEND _disable_vector_unit_list "sse4.1")
+      endif(USE_SSE4_1)
+      if(USE_SSE4_2)
+         list(APPEND _enable_vector_unit_list "sse4.2")
+      else(USE_SSE4_2)
+         list(APPEND _disable_vector_unit_list "sse4.2")
+      endif(USE_SSE4_2)
+      if(USE_SSE4a)
+         list(APPEND _enable_vector_unit_list "sse4a")
+      else(USE_SSE4a)
+         list(APPEND _disable_vector_unit_list "sse4a")
+      endif(USE_SSE4a)
+      if(USE_AVX)
+         list(APPEND _enable_vector_unit_list "avx")
+         # we want SSE intrinsics to result in instructions using the VEX prefix.
+         # Otherwise integer ops (which require the older SSE intrinsics) would
+         # always have a large penalty.
+         list(APPEND _enable_vector_unit_list "sse2avx")
+      else(USE_AVX)
+         list(APPEND _disable_vector_unit_list "avx")
+      endif(USE_AVX)
+      if(USE_XOP)
+         list(APPEND _enable_vector_unit_list "xop")
+      else()
+         list(APPEND _disable_vector_unit_list "xop")
+      endif()
+      if(USE_FMA4)
+         list(APPEND _enable_vector_unit_list "fma4")
+      else()
+         list(APPEND _disable_vector_unit_list "fma4")
+      endif()
+      if(MSVC)
+         # MSVC on 32 bit can select /arch:SSE2 (since 2010 also /arch:AVX)
+         # MSVC on 64 bit cannot select anything (should have changed with MSVC 2010)
+         _my_find(_enable_vector_unit_list "avx" _avx)
+         set(_avx_flag FALSE)
+         if(_avx)
+            AddCompilerFlag("/arch:AVX" CXX_FLAGS Vc_ARCHITECTURE_FLAGS CXX_RESULT _avx_flag)
+         endif()
+         if(NOT _avx_flag)
+            _my_find(_enable_vector_unit_list "sse2" _found)
+            if(_found)
+               AddCompilerFlag("/arch:SSE2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
+            endif()
+         endif()
+         foreach(_flag ${_enable_vector_unit_list})
+            string(TOUPPER "${_flag}" _flag)
+            string(REPLACE "." "_" _flag "__${_flag}__")
+            add_definitions("-D${_flag}")
+         endforeach(_flag)
+      elseif(CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") # ICC (on Linux)
+         _my_find(_available_vector_units_list "avx"    _found)
          if(_found)
-            AddCompilerFlag("-xSSE4.2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
+            AddCompilerFlag("-xAVX" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
          else(_found)
-            _my_find(_available_vector_units_list "sse4.1" _found)
+            _my_find(_available_vector_units_list "sse4.2" _found)
             if(_found)
-               AddCompilerFlag("-xSSE4.1" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
+               AddCompilerFlag("-xSSE4.2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
             else(_found)
-               _my_find(_available_vector_units_list "ssse3"  _found)
+               _my_find(_available_vector_units_list "sse4.1" _found)
                if(_found)
-                  AddCompilerFlag("-xSSSE3" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
+                  AddCompilerFlag("-xSSE4.1" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
                else(_found)
-                  _my_find(_available_vector_units_list "sse3"   _found)
+                  _my_find(_available_vector_units_list "ssse3"  _found)
                   if(_found)
-                     # If the target host is an AMD machine then we still want to use -xSSE2 because the binary would refuse to run at all otherwise
-                     _my_find(_march_flag_list "barcelona" _found)
-                     if(NOT _found)
-                        _my_find(_march_flag_list "k8-sse3" _found)
-                     endif(NOT _found)
-                     if(_found)
-                        AddCompilerFlag("-xSSE2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
-                     else(_found)
-                        AddCompilerFlag("-xSSE3" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
-                     endif(_found)
+                     AddCompilerFlag("-xSSSE3" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
                   else(_found)
-                     _my_find(_available_vector_units_list "sse2"   _found)
+                     _my_find(_available_vector_units_list "sse3"   _found)
                      if(_found)
-                        AddCompilerFlag("-xSSE2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
+                        # If the target host is an AMD machine then we still want to use -xSSE2 because the binary would refuse to run at all otherwise
+                        _my_find(_march_flag_list "barcelona" _found)
+                        if(NOT _found)
+                           _my_find(_march_flag_list "k8-sse3" _found)
+                        endif(NOT _found)
+                        if(_found)
+                           AddCompilerFlag("-xSSE2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
+                        else(_found)
+                           AddCompilerFlag("-xSSE3" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
+                        endif(_found)
+                     else(_found)
+                        _my_find(_available_vector_units_list "sse2"   _found)
+                        if(_found)
+                           AddCompilerFlag("-xSSE2" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
+                        endif(_found)
                      endif(_found)
                   endif(_found)
                endif(_found)
             endif(_found)
          endif(_found)
-      endif(_found)
-   else() # not MSVC and not ICC => GCC, Clang, Open64
-      foreach(_flag ${_march_flag_list})
-         AddCompilerFlag("-march=${_flag}" CXX_RESULT _good CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
-         if(_good)
-            break()
-         endif(_good)
-      endforeach(_flag)
-      foreach(_flag ${_enable_vector_unit_list})
-         AddCompilerFlag("-m${_flag}" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
-      endforeach(_flag)
-      foreach(_flag ${_disable_vector_unit_list})
-         AddCompilerFlag("-mno-${_flag}" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
-      endforeach(_flag)
+      else() # not MSVC and not ICC => GCC, Clang, Open64
+         foreach(_flag ${_march_flag_list})
+            AddCompilerFlag("-march=${_flag}" CXX_RESULT _good CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
+            if(_good)
+               break()
+            endif(_good)
+         endforeach(_flag)
+         foreach(_flag ${_enable_vector_unit_list})
+            AddCompilerFlag("-m${_flag}" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
+         endforeach(_flag)
+         foreach(_flag ${_disable_vector_unit_list})
+            AddCompilerFlag("-mno-${_flag}" CXX_FLAGS Vc_ARCHITECTURE_FLAGS)
+         endforeach(_flag)
+      endif()
    endif()
 endmacro(OptimizeForArchitecture)
index 2e3a267..ddfda57 100644 (file)
@@ -123,7 +123,7 @@ macro(vc_check_assembler)
          string(REGEX REPLACE "\\([^\\)]*\\)" "" _as_version "${_as_version}")
          string(REGEX MATCH "[1-9]\\.[0-9]+(\\.[0-9]+)?" _as_version "${_as_version}")
          if(_as_version VERSION_LESS "2.18.93")
-            message(WARNING "Your binutils is too old (${_as_version}). Some optimizations of Vc will be disabled.")
+            UserWarning("Your binutils is too old (${_as_version}). Some optimizations of Vc will be disabled.")
             add_definitions(-DVC_NO_XGETBV) # old assembler doesn't know the xgetbv instruction
          endif()
       endif()
@@ -157,6 +157,7 @@ macro(vc_set_preferred_compiler_flags)
 
    set(Vc_SSE_INTRINSICS_BROKEN false)
    set(Vc_AVX_INTRINSICS_BROKEN false)
+   set(Vc_XOP_INTRINSICS_BROKEN false)
 
    if(Vc_COMPILER_IS_OPEN64)
       ##################################################################################################
@@ -202,6 +203,11 @@ macro(vc_set_preferred_compiler_flags)
             # GCC gives bogus "array subscript is above array bounds" warnings in math.cpp
             AddCompilerFlag("-Wno-array-bounds")
          endif()
+         if(Vc_GCC_VERSION VERSION_GREATER "4.7.99")
+            # GCC 4.8 warns about stuff we don't care about
+            # Some older GCC versions have problems to note that they don't support the flag
+            AddCompilerFlag("-Wno-unused-local-typedefs")
+         endif()
       endif()
       vc_add_compiler_flag(Vc_DEFINITIONS "-Wabi")
       vc_add_compiler_flag(Vc_DEFINITIONS "-fabi-version=0") # ABI version 4 is required to make __m128 and __m256 appear as different types. 0 should give us the latest version.
@@ -217,7 +223,12 @@ macro(vc_set_preferred_compiler_flags)
          AddCompilerFlag("--param early-inlining-insns=12")
       endif()
 
-      if(Vc_GCC_VERSION VERSION_LESS "4.4.6")
+      if(Vc_GCC_VERSION VERSION_LESS "4.1.99")
+         UserWarning("Your GCC is ancient and crashes on some important optimizations.  The full set of SSE2 intrinsics is not supported.  Vc will fall back to the scalar implementation.  Use of the may_alias and always_inline attributes will be disabled.  In turn all code using Vc must be compiled with -fno-strict-aliasing")
+         vc_add_compiler_flag(Vc_DEFINITIONS "-fno-strict-aliasing")
+         set(Vc_AVX_INTRINSICS_BROKEN true)
+         set(Vc_SSE_INTRINSICS_BROKEN true)
+      elseif(Vc_GCC_VERSION VERSION_LESS "4.4.6")
          UserWarning("Your GCC is older than 4.4.6. This is known to cause problems/bugs. Please update to the latest GCC if you can.")
          set(Vc_AVX_INTRINSICS_BROKEN true)
          if(Vc_GCC_VERSION VERSION_LESS "4.3.0")
@@ -280,6 +291,9 @@ macro(vc_set_preferred_compiler_flags)
 
       # get rid of the min/max macros
       set(Vc_DEFINITIONS "${Vc_DEFINITIONS} -DNOMINMAX")
+
+      # MSVC doesn't implement the XOP intrinsics
+      set(Vc_XOP_INTRINSICS_BROKEN true)
    elseif(Vc_COMPILER_IS_CLANG)
       # for now I don't know of any arguments I want to pass. -march and stuff is tried by OptimizeForArchitecture...
 
index ec91b6f..63668bf 100644 (file)
@@ -67,7 +67,7 @@ namespace
 
 namespace std
 {
-inline std::ostream &operator<<(std::ostream &out, const AnsiColor::Type &c)
+static inline std::ostream &operator<<(std::ostream &out, const AnsiColor::Type &c)
 {
     if (mayUseColor(out)) {
         out << c.data;
@@ -76,7 +76,7 @@ inline std::ostream &operator<<(std::ostream &out, const AnsiColor::Type &c)
 }
 
 template<typename T>
-inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Vector<T> &v)
+static std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Vector<T> &v)
 {
     out << AnsiColor::green << "[";
     out << v[0];
@@ -87,8 +87,7 @@ inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Vecto
     return out;
 }
 
-template<>
-inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Vector<char> &v)
+static std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Vector<char> &v)
 {
     out << AnsiColor::green << "[";
     out << int(v[0]);
@@ -98,8 +97,7 @@ inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Vecto
     out << "]" << AnsiColor::normal;
     return out;
 }
-template<>
-inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Vector<unsigned char> &v)
+static std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Vector<unsigned char> &v)
 {
     out << AnsiColor::green << "[";
     out << int(v[0]);
@@ -112,7 +110,7 @@ inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Vecto
 
 #ifdef VC_HAVE_FMA
 template<typename T>
-inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::VectorMultiplication<T> &v)
+static std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::VectorMultiplication<T> &v)
 {
     return out << VECTOR_NAMESPACE::Vector<T>(v);
 }
@@ -120,10 +118,10 @@ inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Vecto
 
 #ifdef VC_IMPL_AVX
 template<unsigned int VectorSize, size_t RegisterWidth>
-inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Mask<VectorSize, RegisterWidth> &m)
+static std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Mask<VectorSize, RegisterWidth> &m)
 #else
 template<unsigned int VectorSize>
-inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Mask<VectorSize> &m)
+static std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Mask<VectorSize> &m)
 #endif
 {
     out << AnsiColor::blue << "m[";
@@ -141,7 +139,7 @@ inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Mask<
     return out;
 }
 #if VC_IMPL_SSE
-inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Float8Mask &m)
+static std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Float8Mask &m)
 {
     out << AnsiColor::blue << "m[";
     for (unsigned int i = 0; i < 8; ++i) {
@@ -160,7 +158,7 @@ inline std::ostream &operator<<(std::ostream &out, const VECTOR_NAMESPACE::Float
 #endif
 
 template<typename V, typename Parent, typename RM>
-inline std::ostream &operator<<(std::ostream &out, const Vc::MemoryBase<V, Parent, 1, RM> &m )
+static std::ostream &operator<<(std::ostream &out, const Vc::MemoryBase<V, Parent, 1, RM> &m )
 {
     out << AnsiColor::blue << "{" << AnsiColor::normal;
     for (unsigned int i = 0; i < m.vectorsCount(); ++i) {
@@ -171,7 +169,7 @@ inline std::ostream &operator<<(std::ostream &out, const Vc::MemoryBase<V, Paren
 }
 
 template<typename V, typename Parent, typename RM>
-inline std::ostream &operator<<(std::ostream &out, const Vc::MemoryBase<V, Parent, 2, RM> &m )
+static std::ostream &operator<<(std::ostream &out, const Vc::MemoryBase<V, Parent, 2, RM> &m )
 {
     out << AnsiColor::blue << "{" << AnsiColor::normal;
     for (size_t i = 0; i < m.rowsCount(); ++i) {
index cd4fc75..a99d6a4 100644 (file)
 #  define VC_IS_LIKELY(x) __builtin_expect(x, 1)
 #  define VC_RESTRICT __restrict__
 #elif defined(__GNUC__)
-#  if defined(VC_OPEN64)
+#  if VC_GCC < 0x40300 || defined(VC_OPEN64)
+// GCC 4.1 and 4.2 ICE on may_alias. Since Open64 uses the GCC 4.2 frontend it has the same problem.
+#    define MAY_ALIAS
+#  else
+#    define MAY_ALIAS __attribute__((__may_alias__))
+#  endif
+#  if VC_GCC < 0x40200
+// GCC 4.1 fails with "sorry unimplemented: inlining failed"
+#    define INTRINSIC __attribute__((__flatten__))
+#  elif VC_GCC < 0x40300 || defined(VC_OPEN64)
+// the GCC 4.2 frontend doesn't know the __artificial__ attribute
 #    define INTRINSIC __attribute__((__flatten__, __always_inline__))
 #  else
 #    define INTRINSIC __attribute__((__flatten__, __always_inline__, __artificial__))
@@ -75,7 +85,6 @@
 #  define PURE __attribute__((__pure__))
 #  define PURE_L
 #  define PURE_R PURE
-#  define MAY_ALIAS __attribute__((__may_alias__))
 #  define ALWAYS_INLINE __attribute__((__always_inline__))
 #  define ALWAYS_INLINE_L
 #  define ALWAYS_INLINE_R ALWAYS_INLINE
index 6732395..ee6d408 100644 (file)
@@ -20,8 +20,6 @@
 #ifndef CPUID_H
 #define CPUID_H
 
-#include <iostream>
-
 namespace Vc
 {
 class CpuId
index 818fe06..ee5237b 100644 (file)
@@ -47,7 +47,9 @@ CpuId::ProcessorType CpuId::s_processorType = CpuId::IntelReserved;
 bool   CpuId::s_noL2orL3 = false;
 
 #ifdef _MSC_VER
+} // better not include intrin.h inside the Vc namespace :)
 #include <intrin.h>
+namespace Vc {
 #define CPUID(leaf) \
     do { \
         int out[4]; \