Skip to content

Commit 625d767

Browse files
authored
Merge pull request #1384 from boostorg/1383
Only enable our CUDA extensions when the user explicitly asks for them
2 parents cfe946f + 1d9e927 commit 625d767

9 files changed

Lines changed: 251 additions & 11 deletions

File tree

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -820,7 +820,7 @@ jobs:
820820
run: |
821821
cd ../boost-root
822822
mkdir __build__ && cd __build__
823-
cmake -DBOOST_INCLUDE_LIBRARIES=$LIBRARY -DBUILD_TESTING=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DBOOST_MATH_ENABLE_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES=86 -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.8 ..
823+
cmake -DBOOST_INCLUDE_LIBRARIES=$LIBRARY -DBUILD_TESTING=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DBOOST_MATH_ENABLE_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES="75;86" -DCMAKE_CUDA_STANDARD=17 ..
824824
- name: Build tests
825825
run: |
826826
cd ../boost-root/__build__

include/boost/math/special_functions/lanczos.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2751,7 +2751,7 @@ struct lanczos
27512751
} // namespace math
27522752
} // namespace boost
27532753

2754-
#if !defined(_CRAYC) && !defined(__CUDACC__) && (!defined(__GNUC__) || (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ > 3)))
2754+
#if !defined(_CRAYC) && !defined(BOOST_MATH_ENABLE_CUDA) && (!defined(__GNUC__) || (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ > 3)))
27552755
#if ((defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64)) && !defined(_MANAGED) && !defined(BOOST_MATH_HAS_GPU_SUPPORT)
27562756
#include <boost/math/special_functions/detail/lanczos_sse2.hpp>
27572757
#endif

include/boost/math/special_functions/next.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
#include <cfloat>
2626

2727

28-
#if !defined(_CRAYC) && !defined(__CUDACC__) && (!defined(__GNUC__) || (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ > 3)))
28+
#if !defined(_CRAYC) && !defined(BOOST_MATH_ENABLE_CUDA) && (!defined(__GNUC__) || (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ > 3)))
2929
#if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || defined(__SSE2__)
3030
#include "xmmintrin.h"
3131
#define BOOST_MATH_CHECK_SSE2

include/boost/math/special_functions/sign.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
#pragma once
1515
#endif
1616

17-
#ifndef __CUDACC_RTC__
17+
#ifndef BOOST_MATH_HAS_NVRTC
1818

1919
#include <boost/math/tools/config.hpp>
2020
#include <boost/math/special_functions/math_fwd.hpp>
@@ -234,7 +234,7 @@ BOOST_MATH_GPU_ENABLED T sign(T z)
234234
} // namespace math
235235
} // namespace boost
236236

237-
#endif // __CUDACC_RTC__
237+
#endif // BOOST_MATH_HAS_NVRTC
238238

239239
#endif // BOOST_MATH_TOOLS_SIGN_HPP
240240

include/boost/math/tools/config.hpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
#pragma once
1212
#endif
1313

14-
#ifndef __CUDACC_RTC__
14+
#if !(defined(__CUDACC_RTC__) && defined(BOOST_MATH_ENABLE_NVRTC))
1515

1616
#include <boost/math/tools/is_standalone.hpp>
1717

@@ -168,7 +168,7 @@
168168
# define BOOST_MATH_NOINLINE __declspec(noinline)
169169
# elif defined(__GNUC__) && __GNUC__ > 3
170170
// Clang also defines __GNUC__ (as 4)
171-
# if defined(__CUDACC__)
171+
# if defined(__CUDACC__) && defined(BOOST_MATH_ENABLE_CUDA)
172172
// nvcc doesn't always parse __noinline__,
173173
// see: https://svn.boost.org/trac/boost/ticket/9392
174174
# define BOOST_MATH_NOINLINE __attribute__ ((noinline))
@@ -678,7 +678,7 @@ namespace boost{ namespace math{
678678
// CUDA support:
679679
//
680680

681-
#ifdef __CUDACC__
681+
#if defined(__CUDACC__) && defined(BOOST_MATH_ENABLE_CUDA)
682682

683683
// We have to get our include order correct otherwise you get compilation failures
684684
#include <cuda.h>
@@ -774,7 +774,7 @@ BOOST_MATH_GPU_ENABLED constexpr T gpu_safe_max(const T& a, const T& b) { return
774774
# define BOOST_MATH_STATIC_LOCAL_VARIABLE
775775
# else
776776
# define BOOST_MATH_INLINE_CONSTEXPR constexpr
777-
# define BOOST_MATH_STATIC constexpr
777+
# define BOOST_MATH_STATIC static
778778
# define BOOST_MATH_STATIC_LOCAL_VARIABLE static
779779
# endif
780780
#endif

test/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@ if(HAVE_BOOST_TEST)
1010

1111
message(STATUS "Building boost.math with CUDA")
1212

13-
find_package(CUDA REQUIRED)
1413
enable_language(CUDA)
14+
find_package(CUDAToolkit REQUIRED)
1515
set(CMAKE_CUDA_EXTENSIONS OFF)
1616

1717
enable_testing()
1818

19-
boost_test_jamfile(FILE cuda_jamfile LINK_LIBRARIES Boost::math Boost::assert Boost::concept_check Boost::config Boost::core Boost::integer Boost::lexical_cast Boost::multiprecision Boost::predef Boost::random Boost::throw_exception Boost::unit_test_framework ${CUDA_LIBRARIES} INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} )
19+
boost_test_jamfile(FILE cuda_jamfile LINK_LIBRARIES Boost::math Boost::assert Boost::concept_check Boost::config Boost::core Boost::integer Boost::lexical_cast Boost::multiprecision Boost::predef Boost::random Boost::throw_exception Boost::unit_test_framework CUDA::cudart COMPILE_DEFINITIONS BOOST_MATH_ENABLE_CUDA=1 )
2020

2121
elseif (BOOST_MATH_ENABLE_NVRTC)
2222

test/cuda_jamfile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ project : requirements
99
[ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
1010
;
1111

12+
# Github Issues
13+
run github_issue_1383.cu ;
14+
run github_issue_1383_pt_2.cu ;
15+
1216
# Quad
1317
run test_exp_sinh_quad_float.cu ;
1418
run test_exp_sinh_quad_double.cu ;

test/github_issue_1383.cu

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
// Copyright John Maddock 2016.
2+
// Copyright Matt Borland 2024 - 2026.
3+
// Use, modification and distribution are subject to the
4+
// Boost Software License, Version 1.0. (See accompanying file
5+
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6+
7+
#ifdef BOOST_MATH_ENABLE_CUDA
8+
# undef BOOST_MATH_ENABLE_CUDA
9+
#endif // BOOST_MATH_ENABLE_CUDA
10+
11+
// Purposefully pull in headers that caused errors in the linked issue
12+
#include <boost/math/special_functions.hpp>
13+
#include <boost/math/tools/config.hpp>
14+
#include <boost/math/tools/numeric_limits.hpp>
15+
#include <boost/math/constants/constants.hpp>
16+
17+
#include <iostream>
18+
#include <iomanip>
19+
#include <vector>
20+
#include <cmath>
21+
#include <random>
22+
#include "cuda_managed_ptr.hpp"
23+
#include "stopwatch.hpp"
24+
25+
#ifdef BOOST_MATH_ENABLE_CUDA
26+
# error "We should not be enabling this ourselves"
27+
#endif // BOOST_MATH_ENABLE_CUDA
28+
29+
// For the CUDA runtime routines (prefixed with "cuda_")
30+
#include <cuda_runtime.h>
31+
32+
typedef double float_type;
33+
34+
/**
35+
* CUDA Kernel Device code
36+
*
37+
*/
38+
/**
 * CUDA Kernel Device code
 *
 * One thread per element: each in-range thread writes the cosine of its
 * input slot to the matching output slot; surplus threads in the final
 * block simply return.
 */
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
    using std::cos;

    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= numElements)
    {
        return; // thread past the end of the data
    }

    out[idx] = cos(in[idx]);
}
48+
49+
/**
50+
* Host main routine
51+
*/
52+
int main()
53+
{
54+
// Error code to check return values for CUDA calls
55+
cudaError_t err = cudaSuccess;
56+
57+
// Print the vector length to be used, and compute its size
58+
int numElements = 50000;
59+
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
60+
61+
// Allocate the managed input vector A
62+
cuda_managed_ptr<float_type> input_vector(numElements);
63+
64+
// Allocate the managed output vector C
65+
cuda_managed_ptr<float_type> output_vector(numElements);
66+
67+
// Initialize the input vectors
68+
// Check some of our numeric_limits for viability
69+
std::mt19937_64 rng {42};
70+
std::uniform_real_distribution<float_type> dist(0, boost::math::constants::pi<float_type>());
71+
static_assert(boost::math::numeric_limits<float_type>::is_specialized, "Should be since it's a double");
72+
static_assert(boost::math::numeric_limits<float_type>::is_signed, "Should be since it's a double");
73+
74+
for (int i = 0; i < numElements; ++i)
75+
{
76+
input_vector[i] = dist(rng);
77+
}
78+
79+
// Launch the Vector Add CUDA Kernel
80+
int threadsPerBlock = 256;
81+
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
82+
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
83+
84+
watch w;
85+
86+
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
87+
cudaDeviceSynchronize();
88+
89+
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
90+
91+
err = cudaGetLastError();
92+
93+
if (err != cudaSuccess)
94+
{
95+
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
96+
return EXIT_FAILURE;
97+
}
98+
99+
// Verify that the result vector is correct
100+
std::vector<float_type> results;
101+
results.reserve(numElements);
102+
w.reset();
103+
for(int i = 0; i < numElements; ++i)
104+
results.push_back(std::cos(input_vector[i]));
105+
double t = w.elapsed();
106+
// check the results
107+
for(int i = 0; i < numElements; ++i)
108+
{
109+
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
110+
{
111+
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
112+
return EXIT_FAILURE;
113+
}
114+
}
115+
116+
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
117+
std::cout << "Done\n";
118+
119+
return 0;
120+
}

test/github_issue_1383_pt_2.cu

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
// Copyright John Maddock 2016.
2+
// Copyright Matt Borland 2024 - 2026.
3+
// Use, modification and distribution are subject to the
4+
// Boost Software License, Version 1.0. (See accompanying file
5+
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6+
7+
// Purposefully pull in headers that caused errors in the linked issue
8+
#include <boost/math/special_functions.hpp>
9+
#include <boost/math/tools/config.hpp>
10+
#include <boost/math/tools/numeric_limits.hpp>
11+
#include <boost/math/constants/constants.hpp>
12+
13+
#include <iostream>
14+
#include <iomanip>
15+
#include <vector>
16+
#include <cmath>
17+
#include <random>
18+
#include "cuda_managed_ptr.hpp"
19+
#include "stopwatch.hpp"
20+
21+
// For the CUDA runtime routines (prefixed with "cuda_")
22+
#include <cuda_runtime.h>
23+
24+
typedef double float_type;
25+
26+
/**
27+
* CUDA Kernel Device code
28+
*
29+
*/
30+
/**
 * CUDA Kernel Device code
 *
 * Each in-range thread writes cos(in[i]) to out[i], then exercises
 * boost::math::numeric_limits on the device: if the stored value exceeds
 * max() or double reports as unsigned, the kernel traps. Out-of-range
 * threads return immediately.
 */
__global__ void cuda_test(const float_type *in, float_type *out, int numElements)
{
    using std::cos;

    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= numElements)
    {
        return; // thread past the end of the data
    }

    out[idx] = cos(in[idx]);

    // Device-side sanity check of numeric_limits traits
    if (out[idx] > boost::math::numeric_limits<float_type>::max() || !boost::math::numeric_limits<float_type>::is_signed)
    {
        __trap();
    }
}
44+
45+
/**
46+
* Host main routine
47+
*/
48+
int main()
49+
{
50+
// Error code to check return values for CUDA calls
51+
cudaError_t err = cudaSuccess;
52+
53+
// Print the vector length to be used, and compute its size
54+
int numElements = 50000;
55+
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
56+
57+
// Allocate the managed input vector A
58+
cuda_managed_ptr<float_type> input_vector(numElements);
59+
60+
// Allocate the managed output vector C
61+
cuda_managed_ptr<float_type> output_vector(numElements);
62+
63+
// Initialize the input vectors
64+
// Check some of our numeric_limits for viability
65+
std::mt19937_64 rng {42};
66+
std::uniform_real_distribution<float_type> dist(0, boost::math::constants::pi<float_type>());
67+
static_assert(boost::math::numeric_limits<float_type>::is_specialized, "Should be since it's a double");
68+
static_assert(boost::math::numeric_limits<float_type>::is_signed, "Should be since it's a double");
69+
70+
for (int i = 0; i < numElements; ++i)
71+
{
72+
input_vector[i] = dist(rng);
73+
}
74+
75+
// Launch the Vector Add CUDA Kernel
76+
int threadsPerBlock = 256;
77+
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
78+
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
79+
80+
watch w;
81+
82+
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
83+
cudaDeviceSynchronize();
84+
85+
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
86+
87+
err = cudaGetLastError();
88+
89+
if (err != cudaSuccess)
90+
{
91+
std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
92+
return EXIT_FAILURE;
93+
}
94+
95+
// Verify that the result vector is correct
96+
std::vector<float_type> results;
97+
results.reserve(numElements);
98+
w.reset();
99+
for(int i = 0; i < numElements; ++i)
100+
results.push_back(std::cos(input_vector[i]));
101+
double t = w.elapsed();
102+
// check the results
103+
for(int i = 0; i < numElements; ++i)
104+
{
105+
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
106+
{
107+
std::cerr << "Result verification failed at element " << i << "!" << std::endl;
108+
return EXIT_FAILURE;
109+
}
110+
}
111+
112+
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
113+
std::cout << "Done\n";
114+
115+
return 0;
116+
}

0 commit comments

Comments (0)