From e768d96c74579c79e184027775e51b08cd77fe45 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Mon, 22 Oct 2018 10:54:44 -0400 Subject: CUDA: Filter out host link flags during device linking Since commit v3.12.0-rc1~278^2 (CUDA: Pass more link libraries to device linking, 2018-03-27) we consider every link item during device linking. However, items that start in `-` may be host-specific link flags that nvcc will not understand during device linking. Filter such items using a white list. In particular, this allows `-pthread` to be used for host linking while not polluting the device link line. Issue: #18008 --- Source/cmLinkLineDeviceComputer.cxx | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/Source/cmLinkLineDeviceComputer.cxx b/Source/cmLinkLineDeviceComputer.cxx index 557fa41..470f394 100644 --- a/Source/cmLinkLineDeviceComputer.cxx +++ b/Source/cmLinkLineDeviceComputer.cxx @@ -23,6 +23,23 @@ cmLinkLineDeviceComputer::~cmLinkLineDeviceComputer() { } +static bool cmLinkItemValidForDevice(std::string const& item) +{ + // Valid items are: + // * Non-flags (does not start in '-') + // * Specific flags --library, --library-path, -l, -L + // For example: + // * 'cublas_device' => pass-along + // * '--library pthread' => pass-along + // * '-lpthread' => pass-along + // * '-pthread' => drop + // * '-a' => drop + return (!cmHasLiteralPrefix(item, "-") || // + cmHasLiteralPrefix(item, "-l") || // + cmHasLiteralPrefix(item, "-L") || // + cmHasLiteralPrefix(item, "--library")); +} + std::string cmLinkLineDeviceComputer::ComputeLinkLibraries( cmComputeLinkInformation& cli, std::string const& stdLibString) { @@ -60,7 +77,7 @@ std::string cmLinkLineDeviceComputer::ComputeLinkLibraries( } fout << this->ConvertToOutputFormat( this->ConvertToLinkReference(item.Value)); - } else { + } else if (cmLinkItemValidForDevice(item.Value)) { fout << item.Value; } fout << " "; -- cgit v0.12 From 83c13ca44f661ba22acf4abe63d84fd5651b4dbc Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Mon, 22 Oct 2018 09:54:54 -0400 Subject: FindThreads: Pass -pthread to CUDA compiler through -Xcompiler Fixes: #18008 --- Modules/FindThreads.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Modules/FindThreads.cmake b/Modules/FindThreads.cmake index a0148dd..75e83ea 100644 --- a/Modules/FindThreads.cmake +++ b/Modules/FindThreads.cmake @@ -208,7 +208,9 @@ if(THREADS_FOUND AND NOT TARGET Threads::Threads) add_library(Threads::Threads INTERFACE IMPORTED) if(THREADS_HAVE_PTHREAD_ARG) - set_property(TARGET Threads::Threads PROPERTY INTERFACE_COMPILE_OPTIONS "-pthread") + set_property(TARGET Threads::Threads + PROPERTY INTERFACE_COMPILE_OPTIONS "$<$:SHELL:-Xcompiler -pthread>" + "$<$>:-pthread>") endif() if(CMAKE_THREAD_LIBS_INIT) -- cgit v0.12 From 2cc050b53b4afb1ed62621360b860e25b7c46015 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Mon, 22 Oct 2018 10:54:44 -0400 Subject: CUDA: Add test for device linking when host linking uses threads Convert the `CudaOnly.LinkSystemDeviceLibraries` test to a new `Cuda.ProperDeviceLibraries` test. The former covered only the `cublas_device` library which is removed by CUDA 10. Extend the new test to also cover various cases of using threads. Issue: #18008 --- Tests/Cuda/CMakeLists.txt | 1 + Tests/Cuda/ProperDeviceLibraries/CMakeLists.txt | 45 +++++++++++ Tests/Cuda/ProperDeviceLibraries/main.cu | 92 ++++++++++++++++++++++ Tests/Cuda/ProperDeviceLibraries/use_pthreads.cu | 9 +++ Tests/Cuda/ProperDeviceLibraries/use_pthreads.cxx | 9 +++ Tests/CudaOnly/CMakeLists.txt | 1 - .../LinkSystemDeviceLibraries/CMakeLists.txt | 15 ---- Tests/CudaOnly/LinkSystemDeviceLibraries/main.cu | 83 ------------------- 8 files changed, 156 insertions(+), 99 deletions(-) create mode 100644 Tests/Cuda/ProperDeviceLibraries/CMakeLists.txt create mode 100644 Tests/Cuda/ProperDeviceLibraries/main.cu create mode 100644 Tests/Cuda/ProperDeviceLibraries/use_pthreads.cu create mode 100644 Tests/Cuda/ProperDeviceLibraries/use_pthreads.cxx delete mode 100644 Tests/CudaOnly/LinkSystemDeviceLibraries/CMakeLists.txt delete mode 100644 Tests/CudaOnly/LinkSystemDeviceLibraries/main.cu diff --git a/Tests/Cuda/CMakeLists.txt b/Tests/Cuda/CMakeLists.txt index 8a43df5..1b3daa6 100644 --- a/Tests/Cuda/CMakeLists.txt +++ b/Tests/Cuda/CMakeLists.txt @@ -4,5 +4,6 @@ ADD_TEST_MACRO(Cuda.ConsumeCompileFeatures CudaConsumeCompileFeatures) ADD_TEST_MACRO(Cuda.ObjectLibrary CudaObjectLibrary) ADD_TEST_MACRO(Cuda.MixedStandardLevels MixedStandardLevels) ADD_TEST_MACRO(Cuda.ToolkitInclude CudaToolkitInclude) +ADD_TEST_MACRO(Cuda.ProperDeviceLibraries ProperDeviceLibraries) ADD_TEST_MACRO(Cuda.ProperLinkFlags ProperLinkFlags) ADD_TEST_MACRO(Cuda.WithC CudaWithC) diff --git a/Tests/Cuda/ProperDeviceLibraries/CMakeLists.txt b/Tests/Cuda/ProperDeviceLibraries/CMakeLists.txt new file mode 100644 index 0000000..cb47b09 --- /dev/null +++ b/Tests/Cuda/ProperDeviceLibraries/CMakeLists.txt @@ -0,0 +1,45 @@ +cmake_minimum_required(VERSION 3.13) +project(ProperDeviceLibraries CXX CUDA) + +string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_35,code=compute_35 -gencode arch=compute_35,code=sm_35") +set(CMAKE_CUDA_STANDARD 11) + +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads) + +add_executable(ProperDeviceLibraries main.cu) +set_target_properties(ProperDeviceLibraries + PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + +add_library(UseThreadsMixed SHARED use_pthreads.cxx use_pthreads.cu) +target_link_libraries(UseThreadsMixed Threads::Threads) + +add_library(UseThreadsCuda SHARED use_pthreads.cu) +target_link_libraries(UseThreadsCuda Threads::Threads) + +target_link_libraries(ProperDeviceLibraries PRIVATE UseThreadsMixed UseThreadsCuda) + +if(THREADS_HAVE_PTHREAD_ARG AND CMAKE_USE_PTHREADS_INIT) + add_library(UseExplicitPThreadsFlag SHARED use_pthreads.cu) + target_compile_options(UseExplicitPThreadsFlag PUBLIC "-Xcompiler=-pthread") + target_link_libraries(UseExplicitPThreadsFlag PUBLIC "-pthread") + + add_library(UseExplicitLThreadsFlag SHARED use_pthreads.cu) + target_compile_options(UseExplicitLThreadsFlag PUBLIC "-Xcompiler=-pthread") + target_link_libraries(UseExplicitLThreadsFlag PUBLIC "-lpthread") + + add_library(UseExplicitLongThreadsFlag SHARED use_pthreads.cu) + target_link_libraries(UseExplicitLongThreadsFlag PUBLIC "--library pthread") + + target_link_libraries(ProperDeviceLibraries PRIVATE UseExplicitPThreadsFlag UseExplicitLThreadsFlag UseExplicitLongThreadsFlag) +endif() + +if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 10.0.0) + #CUDA 10 removed the cublas_device library + target_link_libraries(ProperDeviceLibraries PRIVATE cublas_device) +endif() + +if(APPLE) + # Help the static cuda runtime find the driver (libcuda.dyllib) at runtime. + set_property(TARGET ProperDeviceLibraries PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) +endif() diff --git a/Tests/Cuda/ProperDeviceLibraries/main.cu b/Tests/Cuda/ProperDeviceLibraries/main.cu new file mode 100644 index 0000000..8ceb0cc --- /dev/null +++ b/Tests/Cuda/ProperDeviceLibraries/main.cu @@ -0,0 +1,92 @@ + +#include +#include +#include + +#if defined(USE_THREADS_POSIX) && defined(HAVE_PTHREAD_H) + +# include +static int verify_linking_to_pthread() +{ + return static_cast(pthread_self()); +} +#endif + +// this test only makes sense for versions of CUDA that ships +// static libraries that have separable compilation device symbols +#if __CUDACC_VER_MAJOR__ <= 9 +__global__ void deviceCublasSgemm(int n, float alpha, float beta, + const float* d_A, const float* d_B, + float* d_C) +{ + cublasHandle_t cnpHandle; + cublasStatus_t status = cublasCreate(&cnpHandle); + + if (status != CUBLAS_STATUS_SUCCESS) { + return; + } + + // Call function defined in the cublas_device system static library. + // This way we can verify that we properly pass system libraries to the + // device link line + status = cublasSgemm(cnpHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &alpha, + d_A, n, d_B, n, &beta, d_C, n); + + cublasDestroy(cnpHandle); +} +#endif + +int choose_cuda_device() +{ + int nDevices = 0; + cudaError_t err = cudaGetDeviceCount(&nDevices); + if (err != cudaSuccess) { + std::cerr << "Failed to retrieve the number of CUDA enabled devices" + << std::endl; + return 1; + } + for (int i = 0; i < nDevices; ++i) { + cudaDeviceProp prop; + cudaError_t err = cudaGetDeviceProperties(&prop, i); + if (err != cudaSuccess) { + std::cerr << "Could not retrieve properties from CUDA device " << i + << std::endl; + return 1; + } + + if (prop.major > 3 || (prop.major == 3 && prop.minor >= 5)) { + err = cudaSetDevice(i); + if (err != cudaSuccess) { + std::cout << "Could not select CUDA device " << i << std::endl; + } else { + return 0; + } + } + } + + std::cout << "Could not find a CUDA enabled card supporting compute >=3.5" + << std::endl; + return 1; +} + +int main(int argc, char** argv) +{ + int ret = choose_cuda_device(); + if (ret) { + return 0; + } + +#if __CUDACC_VER_MAJOR__ <= 9 + // initial values that will make sure that the cublasSgemm won't actually + // do any work + int n = 0; + float alpha = 1; + float beta = 1; + float* d_A = nullptr; + float* d_B = nullptr; + float* d_C = nullptr; + deviceCublasSgemm<<<1, 1>>>(n, alpha, beta, d_A, d_B, d_C); +#endif + + return 0; +} diff --git a/Tests/Cuda/ProperDeviceLibraries/use_pthreads.cu b/Tests/Cuda/ProperDeviceLibraries/use_pthreads.cu new file mode 100644 index 0000000..c57b8a8 --- /dev/null +++ b/Tests/Cuda/ProperDeviceLibraries/use_pthreads.cu @@ -0,0 +1,9 @@ + +#if defined(USE_THREADS_POSIX) && defined(HAVE_PTHREAD_H) + +# include +static int verify_linking_to_pthread_cuda() +{ + return static_cast(pthread_self()); +} +#endif diff --git a/Tests/Cuda/ProperDeviceLibraries/use_pthreads.cxx b/Tests/Cuda/ProperDeviceLibraries/use_pthreads.cxx new file mode 100644 index 0000000..dc7c208 --- /dev/null +++ b/Tests/Cuda/ProperDeviceLibraries/use_pthreads.cxx @@ -0,0 +1,9 @@ + +#if defined(USE_THREADS_POSIX) && defined(HAVE_PTHREAD_H) + +# include +static int verify_linking_to_pthread_cxx() +{ + return static_cast(pthread_self()); +} +#endif diff --git a/Tests/CudaOnly/CMakeLists.txt b/Tests/CudaOnly/CMakeLists.txt index 5b7c0e6..9c4f86a 100644 --- a/Tests/CudaOnly/CMakeLists.txt +++ b/Tests/CudaOnly/CMakeLists.txt @@ -3,7 +3,6 @@ ADD_TEST_MACRO(CudaOnly.CircularLinkLine CudaOnlyCircularLinkLine) ADD_TEST_MACRO(CudaOnly.EnableStandard CudaOnlyEnableStandard) ADD_TEST_MACRO(CudaOnly.ExportPTX CudaOnlyExportPTX) ADD_TEST_MACRO(CudaOnly.GPUDebugFlag CudaOnlyGPUDebugFlag) -ADD_TEST_MACRO(CudaOnly.LinkSystemDeviceLibraries CudaOnlyLinkSystemDeviceLibraries) ADD_TEST_MACRO(CudaOnly.ResolveDeviceSymbols CudaOnlyResolveDeviceSymbols) ADD_TEST_MACRO(CudaOnly.SeparateCompilation CudaOnlySeparateCompilation) ADD_TEST_MACRO(CudaOnly.WithDefs CudaOnlyWithDefs) diff --git a/Tests/CudaOnly/LinkSystemDeviceLibraries/CMakeLists.txt b/Tests/CudaOnly/LinkSystemDeviceLibraries/CMakeLists.txt deleted file mode 100644 index 7f7f606..0000000 --- a/Tests/CudaOnly/LinkSystemDeviceLibraries/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -cmake_minimum_required(VERSION 3.8) -project(LinkSystemDeviceLibraries CUDA) - -string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_35,code=compute_35 -gencode arch=compute_35,code=sm_35") -set(CMAKE_CUDA_STANDARD 11) - -add_executable(CudaOnlyLinkSystemDeviceLibraries main.cu) -set_target_properties( CudaOnlyLinkSystemDeviceLibraries - PROPERTIES CUDA_SEPARABLE_COMPILATION ON) -target_link_libraries( CudaOnlyLinkSystemDeviceLibraries PRIVATE cublas_device) - -if(APPLE) - # Help the static cuda runtime find the driver (libcuda.dyllib) at runtime. - set_property(TARGET CudaOnlyLinkSystemDeviceLibraries PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) -endif() diff --git a/Tests/CudaOnly/LinkSystemDeviceLibraries/main.cu b/Tests/CudaOnly/LinkSystemDeviceLibraries/main.cu deleted file mode 100644 index 2c7c388..0000000 --- a/Tests/CudaOnly/LinkSystemDeviceLibraries/main.cu +++ /dev/null @@ -1,83 +0,0 @@ - -#include -#include -#include - -// this test only makes sense for versions of CUDA that ships -// static libraries that have separable compilation device symbols -#if __CUDACC_VER_MAJOR__ <= 9 -__global__ void deviceCublasSgemm(int n, float alpha, float beta, - const float* d_A, const float* d_B, - float* d_C) -{ - cublasHandle_t cnpHandle; - cublasStatus_t status = cublasCreate(&cnpHandle); - - if (status != CUBLAS_STATUS_SUCCESS) { - return; - } - - // Call function defined in the cublas_device system static library. - // This way we can verify that we properly pass system libraries to the - // device link line - status = cublasSgemm(cnpHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &alpha, - d_A, n, d_B, n, &beta, d_C, n); - - cublasDestroy(cnpHandle); -} -#endif - -int choose_cuda_device() -{ - int nDevices = 0; - cudaError_t err = cudaGetDeviceCount(&nDevices); - if (err != cudaSuccess) { - std::cerr << "Failed to retrieve the number of CUDA enabled devices" - << std::endl; - return 1; - } - for (int i = 0; i < nDevices; ++i) { - cudaDeviceProp prop; - cudaError_t err = cudaGetDeviceProperties(&prop, i); - if (err != cudaSuccess) { - std::cerr << "Could not retrieve properties from CUDA device " << i - << std::endl; - return 1; - } - - if (prop.major > 3 || (prop.major == 3 && prop.minor >= 5)) { - err = cudaSetDevice(i); - if (err != cudaSuccess) { - std::cout << "Could not select CUDA device " << i << std::endl; - } else { - return 0; - } - } - } - - std::cout << "Could not find a CUDA enabled card supporting compute >=3.5" - << std::endl; - return 1; -} - -int main(int argc, char** argv) -{ - int ret = choose_cuda_device(); - if (ret) { - return 0; - } - -#if __CUDACC_VER_MAJOR__ <= 9 - // initial values that will make sure that the cublasSgemm won't actually - // do any work - int n = 0; - float alpha = 1; - float beta = 1; - float* d_A = nullptr; - float* d_B = nullptr; - float* d_C = nullptr; - deviceCublasSgemm<<<1, 1>>>(n, alpha, beta, d_A, d_B, d_C); -#endif - - return 0; -} -- cgit v0.12