diff options
41 files changed, 911 insertions, 51 deletions
diff --git a/Help/manual/cmake-properties.7.rst b/Help/manual/cmake-properties.7.rst index 393735e..17208aa 100644 --- a/Help/manual/cmake-properties.7.rst +++ b/Help/manual/cmake-properties.7.rst @@ -173,6 +173,7 @@ Properties on Targets /prop_tgt/CUDA_PTX_COMPILATION /prop_tgt/CUDA_SEPARABLE_COMPILATION /prop_tgt/CUDA_RESOLVE_DEVICE_SYMBOLS + /prop_tgt/CUDA_RUNTIME_LIBRARY /prop_tgt/CUDA_EXTENSIONS /prop_tgt/CUDA_STANDARD /prop_tgt/CUDA_STANDARD_REQUIRED diff --git a/Help/manual/cmake-variables.7.rst b/Help/manual/cmake-variables.7.rst index da2b06e..972dc1b 100644 --- a/Help/manual/cmake-variables.7.rst +++ b/Help/manual/cmake-variables.7.rst @@ -372,6 +372,7 @@ Variables that Control the Build /variable/CMAKE_CTEST_ARGUMENTS /variable/CMAKE_CUDA_SEPARABLE_COMPILATION /variable/CMAKE_CUDA_RESOLVE_DEVICE_SYMBOLS + /variable/CMAKE_CUDA_RUNTIME_LIBRARY /variable/CMAKE_DEBUG_POSTFIX /variable/CMAKE_DISABLE_PRECOMPILE_HEADERS /variable/CMAKE_ENABLE_EXPORTS diff --git a/Help/prop_tgt/CUDA_RUNTIME_LIBRARY-VALUES.txt b/Help/prop_tgt/CUDA_RUNTIME_LIBRARY-VALUES.txt new file mode 100644 index 0000000..a6d7050 --- /dev/null +++ b/Help/prop_tgt/CUDA_RUNTIME_LIBRARY-VALUES.txt @@ -0,0 +1,9 @@ +``None`` + Link with ``-cudart=none`` or equivalent flag(s) to use no CUDA + runtime library. +``Shared`` + Link with ``-cudart=shared`` or equivalent flag(s) to use a + dynamically-linked CUDA runtime library. +``Static`` + Link with ``-cudart=static`` or equivalent flag(s) to use a + statically-linked CUDA runtime library. diff --git a/Help/prop_tgt/CUDA_RUNTIME_LIBRARY.rst b/Help/prop_tgt/CUDA_RUNTIME_LIBRARY.rst new file mode 100644 index 0000000..0782765 --- /dev/null +++ b/Help/prop_tgt/CUDA_RUNTIME_LIBRARY.rst @@ -0,0 +1,21 @@ +CUDA_RUNTIME_LIBRARY +-------------------- + +Select the CUDA runtime library for use by compilers targeting the CUDA language. + +The allowed case insensitive values are: + +.. include:: CUDA_RUNTIME_LIBRARY-VALUES.txt + +Contents of ``CUDA_RUNTIME_LIBRARY`` may use +:manual:`generator expressions <cmake-generator-expressions(7)>`. + +If this property is not set then CMake uses the default value +``Static`` to select the CUDA runtime library. + +.. note:: + + This property has effect only when the ``CUDA`` language is enabled. To + control the CUDA runtime linking when only using the CUDA SDK with the + ``C`` or ``C++`` language we recommend using the :module:`FindCUDAToolkit` + module. diff --git a/Help/release/dev/cuda-runtime-library.rst b/Help/release/dev/cuda-runtime-library.rst new file mode 100644 index 0000000..0d5b1f6 --- /dev/null +++ b/Help/release/dev/cuda-runtime-library.rst @@ -0,0 +1,7 @@ +cuda-runtime-library +-------------------- + +* The :variable:`CMAKE_CUDA_RUNTIME_LIBRARY` variable and + :prop_tgt:`CUDA_RUNTIME_LIBRARY` target property were introduced to + select the CUDA runtime library used when linking targets that + use CUDA. diff --git a/Help/variable/CMAKE_CUDA_RUNTIME_LIBRARY.rst b/Help/variable/CMAKE_CUDA_RUNTIME_LIBRARY.rst new file mode 100644 index 0000000..ea1c1b8 --- /dev/null +++ b/Help/variable/CMAKE_CUDA_RUNTIME_LIBRARY.rst @@ -0,0 +1,24 @@ +CMAKE_CUDA_RUNTIME_LIBRARY +-------------------------- + +Select the CUDA runtime library for use by compilers targeting the MSVC ABI. +This variable is used to initialize the :prop_tgt:`CUDA_RUNTIME_LIBRARY` +property on all targets as they are created. + +The allowed case insensitive values are: + +.. include:: ../prop_tgt/CUDA_RUNTIME_LIBRARY-VALUES.txt + +Contents of ``CMAKE_CUDA_RUNTIME_LIBRARY`` may use +:manual:`generator expressions <cmake-generator-expressions(7)>`. + +If this variable is not set then the :prop_tgt:`CUDA_RUNTIME_LIBRARY` target +property will not be set automatically. If that property is not set then +CMake uses the default value ``Static`` to select the CUDA runtime library. + +.. note:: + + This property has effect only when the ``CUDA`` language is enabled. To + control the CUDA runtime linking when only using the CUDA SDK with the + ``C`` or ``C++`` language we recommend using the :module:`FindCUDAToolkit` + module. diff --git a/Modules/CMakeTestCUDACompiler.cmake b/Modules/CMakeTestCUDACompiler.cmake index a0f6bc9..d80b55a 100644 --- a/Modules/CMakeTestCUDACompiler.cmake +++ b/Modules/CMakeTestCUDACompiler.cmake @@ -67,6 +67,17 @@ else() set(CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES "${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}") endif() + # Remove the following libraries from CMAKE_CUDA_HOST_IMPLICIT_LINK_LIBRARIES and + # CMAKE_CUDA_IMPLICIT_LINK_LIBRARIES + # + # - cudart + # - cudart_static + # - cudadevrt + # + # These are controlled by CMAKE_CUDA_RUNTIME_LIBRARY + list(REMOVE_ITEM CMAKE_CUDA_IMPLICIT_LINK_LIBRARIES cudart cudart_static cudadevrt) + list(REMOVE_ITEM CMAKE_CUDA_HOST_IMPLICIT_LINK_LIBRARIES cudart cudart_static cudadevrt) + # Re-configure to save learned information. configure_file( ${CMAKE_ROOT}/Modules/CMakeCUDACompiler.cmake.in diff --git a/Modules/Compiler/NVIDIA-CUDA.cmake b/Modules/Compiler/NVIDIA-CUDA.cmake index fb1fc20..a786fb9 100644 --- a/Modules/Compiler/NVIDIA-CUDA.cmake +++ b/Modules/Compiler/NVIDIA-CUDA.cmake @@ -43,6 +43,11 @@ endif() set(CMAKE_SHARED_LIBRARY_CREATE_CUDA_FLAGS -shared) set(CMAKE_INCLUDE_SYSTEM_FLAG_CUDA -isystem=) +set(CMAKE_CUDA_RUNTIME_LIBRARY_DEFAULT "STATIC") +set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_STATIC "cudadevrt;cudart_static") +set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_SHARED "cudadevrt;cudart") +set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_NONE "") + if("x${CMAKE_CUDA_SIMULATE_ID}" STREQUAL "xMSVC") set(CMAKE_CUDA03_STANDARD_COMPILE_OPTION "") set(CMAKE_CUDA03_EXTENSION_COMPILE_OPTION "") diff --git a/Modules/Platform/Windows-NVIDIA-CUDA.cmake b/Modules/Platform/Windows-NVIDIA-CUDA.cmake index 30b5aa9..f809094 100644 --- a/Modules/Platform/Windows-NVIDIA-CUDA.cmake +++ b/Modules/Platform/Windows-NVIDIA-CUDA.cmake @@ -69,6 +69,11 @@ else() endif() unset(_cmp0092) +set(CMAKE_CUDA_RUNTIME_LIBRARY_DEFAULT "STATIC") +set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_STATIC "cudadevrt;cudart_static") +set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_SHARED "cudadevrt;cudart") +set(CMAKE_CUDA_RUNTIME_LIBRARY_LINK_OPTIONS_NONE "") + string(APPEND CMAKE_CUDA_FLAGS_INIT " ${PLATFORM_DEFINES_CUDA} -D_WINDOWS -Xcompiler=\"${_W3}${_FLAGS_CXX}\"") string(APPEND CMAKE_CUDA_FLAGS_DEBUG_INIT " -Xcompiler=\"${_MDd}-Zi -Ob0 -Od ${_RTC1}\"") string(APPEND CMAKE_CUDA_FLAGS_RELEASE_INIT " -Xcompiler=\"${_MD}-O2 -Ob2\" -DNDEBUG") diff --git a/Source/cmComputeLinkInformation.cxx b/Source/cmComputeLinkInformation.cxx index 8773d10..f9f5b72 100644 --- a/Source/cmComputeLinkInformation.cxx +++ b/Source/cmComputeLinkInformation.cxx @@ -10,6 +10,7 @@ #include "cmAlgorithms.h" #include "cmComputeLinkDepends.h" +#include "cmGeneratorExpression.h" #include "cmGeneratorTarget.h" #include "cmGlobalGenerator.h" #include "cmListFileCache.h" @@ -573,6 +574,15 @@ void cmComputeLinkInformation::AddImplicitLinkInfo() cmGeneratorTarget::LinkClosure const* lc = this->Target->GetLinkClosure(this->Config); for (std::string const& li : lc->Languages) { + + if (li == "CUDA") { + // These need to go before the other implicit link information + // as they could require symbols from those other library + // Currently restricted to CUDA as it is the only language + // we have documented runtime behavior controls for + this->AddRuntimeLinkLibrary(li); + } + // Skip those of the linker language. They are implicit. if (li != this->LinkLanguage) { this->AddImplicitLinkInfo(li); @@ -580,6 +590,39 @@ void cmComputeLinkInformation::AddImplicitLinkInfo() } } +void cmComputeLinkInformation::AddRuntimeLinkLibrary(std::string const& lang) +{ // Add the lang runtime library flags. This is activated by the presence + // of a default selection whether or not it is overridden by a property. + std::string defaultVar = + cmStrCat("CMAKE_", lang, "_RUNTIME_LIBRARY_DEFAULT"); + const char* langRuntimeLibraryDefault = + this->Makefile->GetDefinition(defaultVar); + if (langRuntimeLibraryDefault && *langRuntimeLibraryDefault) { + const char* runtimeLibraryValue = + this->Target->GetProperty(cmStrCat(lang, "_RUNTIME_LIBRARY")); + if (!runtimeLibraryValue) { + runtimeLibraryValue = langRuntimeLibraryDefault; + } + + std::string runtimeLibrary = + cmSystemTools::UpperCase(cmGeneratorExpression::Evaluate( + runtimeLibraryValue, this->Target->GetLocalGenerator(), this->Config, + this->Target)); + if (!runtimeLibrary.empty()) { + if (const char* runtimeLinkOptions = this->Makefile->GetDefinition( + "CMAKE_" + lang + "_RUNTIME_LIBRARY_LINK_OPTIONS_" + + runtimeLibrary)) { + std::vector<std::string> libsVec = cmExpandedList(runtimeLinkOptions); + for (std::string const& i : libsVec) { + if (!cmContains(this->ImplicitLinkLibs, i)) { + this->AddItem(i, nullptr); + } + } + } + } + } +} + void cmComputeLinkInformation::AddImplicitLinkInfo(std::string const& lang) { // Add libraries for this language that are not implied by the diff --git a/Source/cmComputeLinkInformation.h b/Source/cmComputeLinkInformation.h index 92ab83b..46f6705 100644 --- a/Source/cmComputeLinkInformation.h +++ b/Source/cmComputeLinkInformation.h @@ -172,6 +172,7 @@ private: void LoadImplicitLinkInfo(); void AddImplicitLinkInfo(); void AddImplicitLinkInfo(std::string const& lang); + void AddRuntimeLinkLibrary(std::string const& lang); std::set<std::string> ImplicitLinkDirs; std::set<std::string> ImplicitLinkLibs; diff --git a/Source/cmTarget.cxx b/Source/cmTarget.cxx index a0b3138..8f0a8e0 100644 --- a/Source/cmTarget.cxx +++ b/Source/cmTarget.cxx @@ -358,6 +358,7 @@ cmTarget::cmTarget(std::string const& name, cmStateEnums::TargetType type, initProp("CUDA_COMPILER_LAUNCHER"); initProp("CUDA_SEPARABLE_COMPILATION"); initProp("CUDA_RESOLVE_DEVICE_SYMBOLS"); + initProp("CUDA_RUNTIME_LIBRARY"); initProp("LINK_SEARCH_START_STATIC"); initProp("LINK_SEARCH_END_STATIC"); initProp("Swift_LANGUAGE_VERSION"); diff --git a/Source/cmVisualStudio10TargetGenerator.cxx b/Source/cmVisualStudio10TargetGenerator.cxx index f707bb4..2a09910 100644 --- a/Source/cmVisualStudio10TargetGenerator.cxx +++ b/Source/cmVisualStudio10TargetGenerator.cxx @@ -3636,18 +3636,7 @@ bool cmVisualStudio10TargetGenerator::ComputeLinkOptions( this->AddLibraries(cli, libVec, vsTargetVec, config); if (cmContains(linkClosure->Languages, "CUDA") && this->CudaOptions[config] != nullptr) { - switch (this->CudaOptions[config]->GetCudaRuntime()) { - case cmVisualStudioGeneratorOptions::CudaRuntimeStatic: - libVec.push_back("cudadevrt.lib"); - libVec.push_back("cudart_static.lib"); - break; - case cmVisualStudioGeneratorOptions::CudaRuntimeShared: - libVec.push_back("cudadevrt.lib"); - libVec.push_back("cudart.lib"); - break; - case cmVisualStudioGeneratorOptions::CudaRuntimeNone: - break; - } + this->CudaOptions[config]->FixCudaRuntime(this->GeneratorTarget); } std::string standardLibsVar = cmStrCat("CMAKE_", linkLanguage, "_STANDARD_LIBRARIES"); diff --git a/Source/cmVisualStudioGeneratorOptions.cxx b/Source/cmVisualStudioGeneratorOptions.cxx index 18c19b7..4004b66 100644 --- a/Source/cmVisualStudioGeneratorOptions.cxx +++ b/Source/cmVisualStudioGeneratorOptions.cxx @@ -3,6 +3,8 @@ #include <cm/iterator> #include "cmAlgorithms.h" +#include "cmGeneratorExpression.h" +#include "cmGeneratorTarget.h" #include "cmLocalVisualStudioGenerator.h" #include "cmOutputConverter.h" #include "cmSystemTools.h" @@ -149,25 +151,33 @@ bool cmVisualStudioGeneratorOptions::UsingSBCS() const return false; } -cmVisualStudioGeneratorOptions::CudaRuntime -cmVisualStudioGeneratorOptions::GetCudaRuntime() const +void cmVisualStudioGeneratorOptions::FixCudaRuntime(cmGeneratorTarget* target) { std::map<std::string, FlagValue>::const_iterator i = this->FlagMap.find("CudaRuntime"); - if (i != this->FlagMap.end() && i->second.size() == 1) { - std::string const& cudaRuntime = i->second[0]; - if (cudaRuntime == "Static") { - return CudaRuntimeStatic; - } - if (cudaRuntime == "Shared") { - return CudaRuntimeShared; - } - if (cudaRuntime == "None") { - return CudaRuntimeNone; + if (i == this->FlagMap.end()) { + // User didn't provide am override so get the property value + const char* runtimeLibraryValue = + target->GetProperty("CUDA_RUNTIME_LIBRARY"); + if (runtimeLibraryValue) { + std::string cudaRuntime = + cmSystemTools::UpperCase(cmGeneratorExpression::Evaluate( + runtimeLibraryValue, this->LocalGenerator, this->Configuration, + target)); + if (cudaRuntime == "STATIC") { + this->AddFlag("CudaRuntime", "Static"); + } + if (cudaRuntime == "SHARED") { + this->AddFlag("CudaRuntime", "Shared"); + } + if (cudaRuntime == "NONE") { + this->AddFlag("CudaRuntime", "None"); + } + } else { + // nvcc default is static + this->AddFlag("CudaRuntime", "Static"); } } - // nvcc default is static - return CudaRuntimeStatic; } void cmVisualStudioGeneratorOptions::FixCudaCodeGeneration() diff --git a/Source/cmVisualStudioGeneratorOptions.h b/Source/cmVisualStudioGeneratorOptions.h index d8dcfe2..b335694 100644 --- a/Source/cmVisualStudioGeneratorOptions.h +++ b/Source/cmVisualStudioGeneratorOptions.h @@ -13,6 +13,7 @@ #include "cmIDEOptions.h" class cmLocalVisualStudioGenerator; +class cmGeneratorTarget; using cmVS7FlagTable = cmIDEFlagTable; @@ -61,15 +62,8 @@ public: bool UsingUnicode() const; bool UsingSBCS() const; - enum CudaRuntime - { - CudaRuntimeStatic, - CudaRuntimeShared, - CudaRuntimeNone - }; - CudaRuntime GetCudaRuntime() const; - void FixCudaCodeGeneration(); + void FixCudaRuntime(cmGeneratorTarget* target); void FixManifestUACFlags(); diff --git a/Tests/Cuda/Complex/CMakeLists.txt b/Tests/Cuda/Complex/CMakeLists.txt index d3d4b7c..08d1e16 100644 --- a/Tests/Cuda/Complex/CMakeLists.txt +++ b/Tests/Cuda/Complex/CMakeLists.txt @@ -22,18 +22,11 @@ set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) set(CMAKE_CXX_STANDARD_REQUIRED TRUE) add_library(CudaComplexCppBase SHARED dynamic.cpp) -add_library(CudaComplexSeperableLib STATIC file1.cu file2.cu file3.cu) -set_target_properties(CudaComplexSeperableLib - PROPERTIES CUDA_SEPARABLE_COMPILATION ON) -set_target_properties( CudaComplexSeperableLib - PROPERTIES POSITION_INDEPENDENT_CODE ON) - add_library(CudaComplexSharedLib SHARED dynamic.cu) target_link_libraries(CudaComplexSharedLib PUBLIC CudaComplexCppBase) +add_library(CudaComplexSeperableLib STATIC file1.cu file2.cu file3.cu) add_library(CudaComplexMixedLib SHARED mixed.cpp mixed.cu) -set_target_properties(CudaComplexMixedLib - PROPERTIES CUDA_SEPARABLE_COMPILATION ON) target_link_libraries(CudaComplexMixedLib PUBLIC CudaComplexSharedLib PRIVATE CudaComplexSeperableLib) @@ -41,7 +34,27 @@ target_link_libraries(CudaComplexMixedLib add_executable(CudaComplex main.cpp) target_link_libraries(CudaComplex PUBLIC CudaComplexMixedLib) + +set_target_properties(CudaComplexMixedLib + CudaComplexSeperableLib + PROPERTIES + POSITION_INDEPENDENT_CODE ON + CUDA_SEPARABLE_COMPILATION ON + ) +set_target_properties(CudaComplexMixedLib + CudaComplexSharedLib + PROPERTIES + CUDA_RUNTIME_LIBRARY shared + ) + + if(APPLE) # Help the static cuda runtime find the driver (libcuda.dyllib) at runtime. set_property(TARGET CudaComplex PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) endif() + +if(UNIX) + # Help the shared cuda runtime find libcudart as it is not located + # in a default system searched location + set_property(TARGET CudaComplexMixedLib PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) +endif() diff --git a/Tests/Cuda/Complex/dynamic.cu b/Tests/Cuda/Complex/dynamic.cu index 9da8853..7f2f2b5 100644 --- a/Tests/Cuda/Complex/dynamic.cu +++ b/Tests/Cuda/Complex/dynamic.cu @@ -54,17 +54,20 @@ EXPORT int choose_cuda_device() return 1; } -EXPORT void cuda_dynamic_lib_func() +EXPORT bool cuda_dynamic_lib_func() { - DetermineIfValidCudaDevice<<<1, 1>>>(); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { - std::cerr << "DetermineIfValidCudaDevice [SYNC] failed: " + std::cerr << "DetermineIfValidCudaDevice [Per Launch] failed: " << cudaGetErrorString(err) << std::endl; + return false; } + DetermineIfValidCudaDevice<<<1, 1>>>(); err = cudaDeviceSynchronize(); if (err != cudaSuccess) { - std::cerr << "DetermineIfValidCudaDevice [ASYNC] failed: " + std::cerr << "DetermineIfValidCudaDevice [SYNC] failed: " << cudaGetErrorString(cudaGetLastError()) << std::endl; + return false; } + return true; } diff --git a/Tests/Cuda/Complex/main.cpp b/Tests/Cuda/Complex/main.cpp index 6ca5952..da09b44 100644 --- a/Tests/Cuda/Complex/main.cpp +++ b/Tests/Cuda/Complex/main.cpp @@ -22,5 +22,6 @@ int main(int argc, char** argv) int r1 = call_cuda_seperable_code(42); int r2 = mixed_launch_kernel(42); + return (r1 == 42 || r2 == 42) ? 1 : 0; } diff --git a/Tests/Cuda/Complex/mixed.cu b/Tests/Cuda/Complex/mixed.cu index 5b85aec..76119ad 100644 --- a/Tests/Cuda/Complex/mixed.cu +++ b/Tests/Cuda/Complex/mixed.cu @@ -15,7 +15,7 @@ result_type __device__ file1_func(int x); result_type_dynamic __device__ file2_func(int x); -IMPORT void __host__ cuda_dynamic_lib_func(); +IMPORT bool __host__ cuda_dynamic_lib_func(); static __global__ void mixed_kernel(result_type* r, int x) { @@ -25,7 +25,9 @@ static __global__ void mixed_kernel(result_type* r, int x) EXPORT int mixed_launch_kernel(int x) { - cuda_dynamic_lib_func(); + if (!cuda_dynamic_lib_func()) { + return x; + } result_type* r; cudaError_t err = cudaMallocManaged(&r, sizeof(result_type)); diff --git a/Tests/CudaOnly/CMakeLists.txt b/Tests/CudaOnly/CMakeLists.txt index a0575cd..cc1ee1a 100644 --- a/Tests/CudaOnly/CMakeLists.txt +++ b/Tests/CudaOnly/CMakeLists.txt @@ -5,10 +5,21 @@ ADD_TEST_MACRO(CudaOnly.ExportPTX CudaOnlyExportPTX) ADD_TEST_MACRO(CudaOnly.GPUDebugFlag CudaOnlyGPUDebugFlag) ADD_TEST_MACRO(CudaOnly.ResolveDeviceSymbols CudaOnlyResolveDeviceSymbols) ADD_TEST_MACRO(CudaOnly.SeparateCompilation CudaOnlySeparateCompilation) +ADD_TEST_MACRO(CudaOnly.SharedRuntimePlusToolkit CudaOnlySharedRuntimePlusToolkit) ADD_TEST_MACRO(CudaOnly.Standard98 CudaOnlyStandard98) ADD_TEST_MACRO(CudaOnly.Toolkit CudaOnlyToolkit) ADD_TEST_MACRO(CudaOnly.WithDefs CudaOnlyWithDefs) +# The CUDA only ships the shared version of the toolkit libraries +# on windows +if(NOT WIN32) + ADD_TEST_MACRO(Cuda.StaticRuntimePlusToolkit StaticRuntimePlusToolkit) +endif() + +if(MSVC) + ADD_TEST_MACRO(CudaOnly.PDB CudaOnlyPDB) +endif() + add_test(NAME CudaOnly.DontResolveDeviceSymbols COMMAND ${CMAKE_CTEST_COMMAND} -C $<CONFIGURATION> --build-and-test @@ -20,6 +31,14 @@ add_test(NAME CudaOnly.DontResolveDeviceSymbols COMMAND --test-command ${CMAKE_CTEST_COMMAND} -V -C $<CONFIGURATION> ) -if(MSVC) - ADD_TEST_MACRO(CudaOnly.PDB CudaOnlyPDB) -endif() +add_test(NAME CudaOnly.RuntimeControls COMMAND + ${CMAKE_CTEST_COMMAND} -C $<CONFIGURATION> + --build-and-test + "${CMAKE_CURRENT_SOURCE_DIR}/RuntimeControls/" + "${CMAKE_CURRENT_BINARY_DIR}/RuntimeControls/" + --build-two-config + ${build_generator_args} + --build-project RuntimeControls + --build-options ${build_options} + --test-command ${CMAKE_CTEST_COMMAND} -V -C $<CONFIGURATION> + ) diff --git a/Tests/CudaOnly/RuntimeControls/CMakeLists.txt b/Tests/CudaOnly/RuntimeControls/CMakeLists.txt new file mode 100644 index 0000000..8b58fec --- /dev/null +++ b/Tests/CudaOnly/RuntimeControls/CMakeLists.txt @@ -0,0 +1,60 @@ +cmake_minimum_required(VERSION 3.7) +project (RuntimeControls CUDA) + +# Find nm and dumpbin +if(CMAKE_NM) + set(dump_command ${CMAKE_NM}) + set(dump_args -g) +else() + include(GetPrerequisites) + message(STATUS "calling list_prerequisites to find dumpbin") + list_prerequisites("${CMAKE_COMMAND}" 0 0 0) + if(gp_dumpbin) + set(dump_command ${gp_dumpbin}) + set(dump_args /ARCHIVEMEMBERS) + endif() +endif() + +string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_30,code=[compute_30]") + +set(CMAKE_CUDA_STANDARD 11) +set(CMAKE_CUDA_RUNTIME_LIBRARY static) + +if(NOT "x${CMAKE_CUDA_SIMULATE_ID}" STREQUAL "xMSVC") + add_library(UsesNoCudaRT SHARED file1.cu) + set_target_properties(UsesNoCudaRT PROPERTIES CUDA_RUNTIME_LIBRARY none) +endif() + +add_library(UsesStaticCudaRT SHARED file2.cu) + +add_executable(CudaOnlyRuntimeControls main.cu) +set_target_properties(CudaOnlyRuntimeControls PROPERTIES CUDA_RUNTIME_LIBRARY shared) + +target_link_libraries(CudaOnlyRuntimeControls PRIVATE $<TARGET_NAME_IF_EXISTS:UsesNoCudaRT> UsesStaticCudaRT) + + +if(dump_command) + if(TARGET UsesNoCudaRT) + add_custom_command(TARGET UsesNoCudaRT POST_BUILD + COMMAND ${CMAKE_COMMAND} + -DDUMP_COMMAND=${dump_command} + -DDUMP_ARGS=${dump_args} + -DTEST_LIBRARY_PATH=$<TARGET_FILE:UsesNoCudaRT> + -P ${CMAKE_CURRENT_SOURCE_DIR}/no_runtime.cmake + ) + endif() + add_custom_command(TARGET UsesStaticCudaRT POST_BUILD + COMMAND ${CMAKE_COMMAND} + -DDUMP_COMMAND=${dump_command} + -DDUMP_ARGS=${dump_args} + -DTEST_LIBRARY_PATH=$<TARGET_FILE:UsesStaticCudaRT> + -P ${CMAKE_CURRENT_SOURCE_DIR}/uses_static_runtime.cmake + ) + string(REPLACE ";" "|" dirs "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}") + add_custom_command(TARGET CudaOnlyRuntimeControls POST_BUILD + COMMAND ${CMAKE_COMMAND} + -DEXEC_PATH=$<TARGET_FILE:CudaOnlyRuntimeControls> + -DEXTRA_LIB_DIRS="${dirs}" + -P ${CMAKE_CURRENT_SOURCE_DIR}/verify_runtime.cmake + ) +endif() diff --git a/Tests/CudaOnly/RuntimeControls/file1.cu b/Tests/CudaOnly/RuntimeControls/file1.cu new file mode 100644 index 0000000..28beb5e --- /dev/null +++ b/Tests/CudaOnly/RuntimeControls/file1.cu @@ -0,0 +1,18 @@ + +#ifdef _WIN32 +# define EXPORT __declspec(dllexport) +#else +# define EXPORT +#endif + +void __global__ file1_kernel(int x, int& r) +{ + r = -x; +} + +EXPORT int file1_launch_kernel(int x) +{ + int r = 0; + file1_kernel<<<1, 1>>>(x, r); + return r; +} diff --git a/Tests/CudaOnly/RuntimeControls/file2.cu b/Tests/CudaOnly/RuntimeControls/file2.cu new file mode 100644 index 0000000..ff68a70 --- /dev/null +++ b/Tests/CudaOnly/RuntimeControls/file2.cu @@ -0,0 +1,18 @@ + +#ifdef _WIN32 +# define EXPORT __declspec(dllexport) +#else +# define EXPORT +#endif + +void __global__ file2_kernel(int x, int& r) +{ + r = -x; +} + +EXPORT int file2_launch_kernel(int x) +{ + int r = 0; + file2_kernel<<<1, 1>>>(x, r); + return r; +} diff --git a/Tests/CudaOnly/RuntimeControls/main.cu b/Tests/CudaOnly/RuntimeControls/main.cu new file mode 100644 index 0000000..0be22af --- /dev/null +++ b/Tests/CudaOnly/RuntimeControls/main.cu @@ -0,0 +1,81 @@ + +#include <iostream> + +#include "cuda.h" + +#ifdef _WIN32 +# define IMPORT __declspec(dllimport) +#else +# define IMPORT +#endif + +#ifndef _WIN32 +IMPORT int file1_launch_kernel(int x); +#endif + +IMPORT int file2_launch_kernel(int x); + +int choose_cuda_device() +{ + int nDevices = 0; + cudaError_t err = cudaGetDeviceCount(&nDevices); + if (err != cudaSuccess) { + std::cerr << "Failed to retrieve the number of CUDA enabled devices" + << std::endl; + return 1; + } + for (int i = 0; i < nDevices; ++i) { + cudaDeviceProp prop; + cudaError_t err = cudaGetDeviceProperties(&prop, i); + if (err != cudaSuccess) { + std::cerr << "Could not retrieve properties from CUDA device " << i + << std::endl; + return 1; + } + std::cout << "prop.major: " << prop.major << std::endl; + if (prop.major >= 3) { + err = cudaSetDevice(i); + if (err != cudaSuccess) { + std::cout << "Could not select CUDA device " << i << std::endl; + } else { + return 0; + } + } + } + + std::cout << "Could not find a CUDA enabled card supporting compute >=3.0" + << std::endl; + + return 1; +} + +int main(int argc, char** argv) +{ + int ret = choose_cuda_device(); + if (ret) { + return 0; + } + + cudaError_t err; +#ifndef _WIN32 + file1_launch_kernel(1); + err = cudaGetLastError(); + if (err != cudaSuccess) { + std::cerr << "file1_launch_kernel: kernel launch should have passed.\n " + "Error message: " + << cudaGetErrorString(err) << std::endl; + return 1; + } +#endif + + file2_launch_kernel(1); + err = cudaGetLastError(); + if (err != cudaSuccess) { + std::cerr << "file2_launch_kernel: kernel launch should have passed.\n " + "Error message: " + << cudaGetErrorString(err) << std::endl; + return 1; + } + + return 0; +} diff --git a/Tests/CudaOnly/RuntimeControls/no_runtime.cmake b/Tests/CudaOnly/RuntimeControls/no_runtime.cmake new file mode 100644 index 0000000..55f28cc --- /dev/null +++ b/Tests/CudaOnly/RuntimeControls/no_runtime.cmake @@ -0,0 +1,14 @@ +execute_process(COMMAND ${DUMP_COMMAND} ${DUMP_ARGS} ${TEST_LIBRARY_PATH} + RESULT_VARIABLE RESULT + OUTPUT_VARIABLE OUTPUT + ERROR_VARIABLE ERROR +) + +if(NOT "${RESULT}" STREQUAL "0") + message(FATAL_ERROR "${DUMP_COMMAND} failed [${RESULT}] [${OUTPUT}] [${ERROR}]") +endif() + +if(NOT "${OUTPUT}" MATCHES "(__cuda)") + message(FATAL_ERROR + "not missing cuda device symbols, static runtime linking was used.") +endif() diff --git a/Tests/CudaOnly/RuntimeControls/uses_static_runtime.cmake b/Tests/CudaOnly/RuntimeControls/uses_static_runtime.cmake new file mode 100644 index 0000000..b372fea --- /dev/null +++ b/Tests/CudaOnly/RuntimeControls/uses_static_runtime.cmake @@ -0,0 +1,14 @@ +execute_process(COMMAND ${DUMP_COMMAND} ${DUMP_ARGS} ${TEST_LIBRARY_PATH} + RESULT_VARIABLE RESULT + OUTPUT_VARIABLE OUTPUT + ERROR_VARIABLE ERROR +) + +if(NOT "${RESULT}" STREQUAL "0") + message(FATAL_ERROR "${DUMP_COMMAND} failed [${RESULT}] [${OUTPUT}] [${ERROR}]") +endif() + +if("${OUTPUT}" MATCHES "__cuda") + message(FATAL_ERROR + "missing cuda device symbols, static runtime linking was not used.") +endif() diff --git a/Tests/CudaOnly/RuntimeControls/verify_runtime.cmake b/Tests/CudaOnly/RuntimeControls/verify_runtime.cmake new file mode 100644 index 0000000..b313dac --- /dev/null +++ b/Tests/CudaOnly/RuntimeControls/verify_runtime.cmake @@ -0,0 +1,16 @@ + +string(REPLACE "|" ";" dirs "${EXTRA_LIB_DIRS}") +file(GET_RUNTIME_DEPENDENCIES + RESOLVED_DEPENDENCIES_VAR resolved_libs + UNRESOLVED_DEPENDENCIES_VAR unresolved_libs + DIRECTORIES ${dirs} + EXECUTABLES ${EXEC_PATH} + ) + +list(FILTER resolved_libs INCLUDE REGEX ".*cudart.*") +list(LENGTH resolved_libs has_cudart) + +if(has_cudart EQUAL 0) + message(FATAL_ERROR + "missing cudart shared library from runtime dependency output.") +endif() diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/CMakeLists.txt b/Tests/CudaOnly/SharedRuntimePlusToolkit/CMakeLists.txt new file mode 100644 index 0000000..03fba22 --- /dev/null +++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/CMakeLists.txt @@ -0,0 +1,42 @@ +cmake_minimum_required(VERSION 3.15) +project(SharedRuntimePlusToolkit CUDA) + +#Goal for this example: +# Validate that with c++ we can use some components of the CUDA toolkit, and +# specify the cuda runtime +find_package(CUDAToolkit REQUIRED) + +add_library(Common OBJECT curand.cu nppif.cu) +target_link_libraries(Common PRIVATE CUDA::toolkit) +set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON) + +#shared runtime with shared toolkit libraries +add_library(SharedToolkit SHARED shared.cu) +target_link_libraries(SharedToolkit PRIVATE Common PUBLIC CUDA::curand CUDA::nppif) +set_target_properties(SharedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY none) +target_link_libraries(SharedToolkit PUBLIC CUDA::cudart) + +# The CUDA only ships the shared version of the toolkit libraries +# on windows +if(NOT WIN32) + #shared runtime with static toolkit libraries + add_library(StaticToolkit SHARED static.cu) + target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static) + set_target_properties(StaticToolkit PROPERTIES CUDA_RUNTIME_LIBRARY Shared) + + #static runtime with mixed toolkit libraries + add_library(MixedToolkit SHARED mixed.cu) + target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand_static CUDA::nppif) + set_target_properties(MixedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY Shared) +endif() + +add_executable(CudaOnlySharedRuntimePlusToolkit main.cu) +target_link_libraries(CudaOnlySharedRuntimePlusToolkit PRIVATE SharedToolkit + $<TARGET_NAME_IF_EXISTS:StaticToolkit> + $<TARGET_NAME_IF_EXISTS:MixedToolkit>) + +if(UNIX) + # Help the shared cuda runtime find libcudart as it is not located + # in a default system searched location + set_property(TARGET CudaOnlySharedRuntimePlusToolkit PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) +endif() diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/curand.cu b/Tests/CudaOnly/SharedRuntimePlusToolkit/curand.cu new file mode 100644 index 0000000..fdd7b53 --- /dev/null +++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/curand.cu @@ -0,0 +1,65 @@ +// Comes from: +// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example + +#ifdef _WIN32 +# define EXPORT __declspec(dllexport) +#else +# define EXPORT +#endif + +/* + * This program uses the host CURAND API to generate 100 + * pseudorandom floats. + */ +#include <cuda.h> +#include <curand.h> +#include <stdio.h> +#include <stdlib.h> + +#define CUDA_CALL(x) \ + do { \ + if ((x) != cudaSuccess) { \ + printf("Error at %s:%d\n", __FILE__, __LINE__); \ + return EXIT_FAILURE; \ + } \ + } while (0) +#define CURAND_CALL(x) \ + do { \ + if ((x) != CURAND_STATUS_SUCCESS) { \ + printf("Error at %s:%d\n", __FILE__, __LINE__); \ + return EXIT_FAILURE; \ + } \ + } while (0) + +EXPORT int curand_main() +{ + size_t n = 100; + size_t i; + curandGenerator_t gen; + float *devData, *hostData; + + /* Allocate n floats on host */ + hostData = (float*)calloc(n, sizeof(float)); + + /* Allocate n floats on device */ + CUDA_CALL(cudaMalloc((void**)&devData, n * sizeof(float))); + + /* Create pseudo-random number generator */ + CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT)); + + /* Set seed */ + CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL)); + + /* Generate n floats on device */ + CURAND_CALL(curandGenerateUniform(gen, devData, n)); + + /* Copy device memory to host */ + CUDA_CALL( + cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost)); + + /* Cleanup */ + CURAND_CALL(curandDestroyGenerator(gen)); + CUDA_CALL(cudaFree(devData)); + free(hostData); + return EXIT_SUCCESS; +} diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/main.cu b/Tests/CudaOnly/SharedRuntimePlusToolkit/main.cu new file mode 100644 index 0000000..2a4da22 --- /dev/null +++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/main.cu @@ -0,0 +1,23 @@ + +#ifdef _WIN32 +# define IMPORT __declspec(dllimport) +IMPORT int shared_version(); +int static_version() +{ + return 0; +} +int mixed_version() +{ + return 0; +} +#else +int shared_version(); +int static_version(); +int mixed_version(); +#endif + +int main() +{ + return mixed_version() == 0 && shared_version() == 0 && + static_version() == 0; +} diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/mixed.cu b/Tests/CudaOnly/SharedRuntimePlusToolkit/mixed.cu new file mode 100644 index 0000000..6de6886 --- /dev/null +++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/mixed.cu @@ -0,0 +1,16 @@ + +#ifdef _WIN32 +# define IMPORT __declspec(dllimport) +# define EXPORT __declspec(dllexport) +#else +# define IMPORT +# define EXPORT +#endif + +IMPORT int curand_main(); +IMPORT int nppif_main(); + +EXPORT int mixed_version() +{ + return curand_main() == 0 && nppif_main() == 0; +} diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/nppif.cu b/Tests/CudaOnly/SharedRuntimePlusToolkit/nppif.cu new file mode 100644 index 0000000..ac5341c --- /dev/null +++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/nppif.cu @@ -0,0 +1,92 @@ +// Comes from +// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066 + +#ifdef _WIN32 +# define EXPORT __declspec(dllexport) +#else +# define EXPORT +#endif + +#include <cstdio> +#include <iostream> + +#include <assert.h> +#include <cuda_runtime_api.h> +#include <nppi_filtering_functions.h> + +EXPORT int nppif_main() +{ + /** + * 8-bit unsigned single-channel 1D row convolution. + */ + const int simgrows = 32; + const int simgcols = 32; + Npp8u *d_pSrc, *d_pDst; + const int nMaskSize = 3; + NppiSize oROI; + oROI.width = simgcols - nMaskSize; + oROI.height = simgrows; + const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]); + const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]); + const int simgpix = simgrows * simgcols; + const int dimgpix = oROI.width * oROI.height; + const int nSrcStep = simgcols * sizeof(d_pSrc[0]); + const int nDstStep = oROI.width * sizeof(d_pDst[0]); + const int pixval = 1; + const int nDivisor = 1; + const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval }; + Npp32s* d_pKernel; + const Npp32s nAnchor = 2; + cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize); + if (err != cudaSuccess) { + fprintf(stderr, "Cuda error %d\n", __LINE__); + return 1; + } + err = cudaMalloc((void**)&d_pDst, dimgsize); + if (err != cudaSuccess) { + fprintf(stderr, "Cuda error %d\n", __LINE__); + return 1; + } + err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0])); + if (err != cudaSuccess) { + fprintf(stderr, "Cuda error %d\n", __LINE__); + return 1; + } + // set image to pixval initially + err = cudaMemset(d_pSrc, pixval, simgsize); + if (err != cudaSuccess) { + fprintf(stderr, "Cuda error %d\n", __LINE__); + return 1; + } + err = cudaMemset(d_pDst, 0, dimgsize); + if (err != cudaSuccess) { + fprintf(stderr, "Cuda error %d\n", __LINE__); + return 1; + } + err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]), + cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + fprintf(stderr, "Cuda error %d\n", __LINE__); + return 1; + } + // copy src to dst + NppStatus ret = + nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel, + nMaskSize, nAnchor, nDivisor); + assert(ret == NPP_NO_ERROR); + Npp8u* h_imgres = new Npp8u[dimgpix]; + err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + fprintf(stderr, "Cuda error %d\n", __LINE__); + return 1; + } + // test for filtering + for (int i = 0; i < dimgpix; i++) { + if (h_imgres[i] != (pixval * pixval * nMaskSize)) { + fprintf(stderr, "h_imgres at index %d failed to match\n", i); + return 1; + } + } + + return 0; +} diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/shared.cu b/Tests/CudaOnly/SharedRuntimePlusToolkit/shared.cu new file mode 100644 index 0000000..f3c3dbc --- /dev/null +++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/shared.cu @@ -0,0 +1,16 @@ + +#ifdef _WIN32 +# define IMPORT __declspec(dllimport) +# define EXPORT __declspec(dllexport) +#else +# define IMPORT +# define EXPORT +#endif + +int curand_main(); +int nppif_main(); + +EXPORT int shared_version() +{ + return curand_main() == 0 && nppif_main() == 0; +} diff --git a/Tests/CudaOnly/SharedRuntimePlusToolkit/static.cu b/Tests/CudaOnly/SharedRuntimePlusToolkit/static.cu new file mode 100644 index 0000000..6932fa3 --- /dev/null +++ b/Tests/CudaOnly/SharedRuntimePlusToolkit/static.cu @@ -0,0 +1,16 @@ + +#ifdef _WIN32 +# define IMPORT __declspec(dllimport) +# define EXPORT __declspec(dllexport) +#else +# define IMPORT +# define EXPORT +#endif + +IMPORT int curand_main(); +IMPORT int nppif_main(); + +EXPORT int static_version() +{ + return curand_main() == 0 && nppif_main() == 0; +} diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/CMakeLists.txt b/Tests/CudaOnly/StaticRuntimePlusToolkit/CMakeLists.txt new file mode 100644 index 0000000..97ac229 --- /dev/null +++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/CMakeLists.txt @@ -0,0 +1,29 @@ +cmake_minimum_required(VERSION 3.15) +project(StaticRuntimePlusToolkit CUDA) + +#Goal for this example: +# Validate that with cuda we can use some components of the CUDA toolkit, and +# specify the cuda runtime +find_package(CUDAToolkit REQUIRED) + +add_library(Common OBJECT curand.cu nppif.cu) +target_link_libraries(Common PRIVATE CUDA::toolkit) +set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON) + +#static runtime with shared toolkit libraries +add_library(SharedToolkit SHARED shared.cu) +target_link_libraries(SharedToolkit PRIVATE Common CUDA::curand CUDA::nppif ) +set_target_properties(SharedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY none) +target_link_libraries(SharedToolkit PUBLIC CUDA::cudart_static) + +#static runtime with static toolkit libraries +add_library(StaticToolkit SHARED static.cu) +target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static) + +#static runtime with mixed toolkit libraries +add_library(MixedToolkit SHARED mixed.cu) +target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand CUDA::nppif_static) +set_target_properties(MixedToolkit PROPERTIES CUDA_RUNTIME_LIBRARY Static) + +add_executable(CudaOnlyStaticRuntimePlusToolkit main.cu) +target_link_libraries(CudaOnlyStaticRuntimePlusToolkit PRIVATE SharedToolkit StaticToolkit MixedToolkit) diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/curand.cu b/Tests/CudaOnly/StaticRuntimePlusToolkit/curand.cu new file mode 100644 index 0000000..95872f0 --- /dev/null +++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/curand.cu @@ -0,0 +1,59 @@ +// Comes from: +// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example + +/* + * This program uses the host CURAND API to generate 100 + * pseudorandom floats. + */ +#include <cuda.h> +#include <curand.h> +#include <stdio.h> +#include <stdlib.h> + +#define CUDA_CALL(x) \ + do { \ + if ((x) != cudaSuccess) { \ + printf("Error at %s:%d\n", __FILE__, __LINE__); \ + return EXIT_FAILURE; \ + } \ + } while (0) +#define CURAND_CALL(x) \ + do { \ + if ((x) != CURAND_STATUS_SUCCESS) { \ + printf("Error at %s:%d\n", __FILE__, __LINE__); \ + return EXIT_FAILURE; \ + } \ + } while (0) + +int curand_main() +{ + size_t n = 100; + size_t i; + curandGenerator_t gen; + float *devData, *hostData; + + /* Allocate n floats on host */ + hostData = (float*)calloc(n, sizeof(float)); + + /* Allocate n floats on device */ + CUDA_CALL(cudaMalloc((void**)&devData, n * sizeof(float))); + + /* Create pseudo-random number generator */ + CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT)); + + /* Set seed */ + CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL)); + + /* Generate n floats on device */ + CURAND_CALL(curandGenerateUniform(gen, devData, n)); + + /* Copy device memory to host */ + CUDA_CALL( + cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost)); + + /* Cleanup */ + CURAND_CALL(curandDestroyGenerator(gen)); + CUDA_CALL(cudaFree(devData)); + free(hostData); + return EXIT_SUCCESS; +} diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/main.cu b/Tests/CudaOnly/StaticRuntimePlusToolkit/main.cu new file mode 100644 index 0000000..5a09f8e --- /dev/null +++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/main.cu @@ -0,0 +1,11 @@ + + +int shared_version(); +int static_version(); +int mixed_version(); + +int main() +{ + return mixed_version() == 0 && shared_version() == 0 && + static_version() == 0; +} diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/mixed.cu b/Tests/CudaOnly/StaticRuntimePlusToolkit/mixed.cu new file mode 100644 index 0000000..a05140d --- /dev/null +++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/mixed.cu @@ -0,0 +1,8 @@ + +int curand_main(); +int nppif_main(); + +int mixed_version() +{ + return curand_main() == 0 && nppif_main() == 0; +} diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/nppif.cu b/Tests/CudaOnly/StaticRuntimePlusToolkit/nppif.cu new file mode 100644 index 0000000..2871090 --- /dev/null +++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/nppif.cu @@ -0,0 +1,86 @@ +// Comes from +// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066 + +#include <cstdio> +#include <iostream> + +#include <assert.h> +#include <cuda_runtime_api.h> +#include <nppi_filtering_functions.h> + +int nppif_main() +{ + /** + * 8-bit unsigned single-channel 1D row convolution. + */ + const int simgrows = 32; + const int simgcols = 32; + Npp8u *d_pSrc, *d_pDst; + const int nMaskSize = 3; + NppiSize oROI; + oROI.width = simgcols - nMaskSize; + oROI.height = simgrows; + const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]); + const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]); + const int simgpix = simgrows * simgcols; + const int dimgpix = oROI.width * oROI.height; + const int nSrcStep = simgcols * sizeof(d_pSrc[0]); + const int nDstStep = oROI.width * sizeof(d_pDst[0]); + const int pixval = 1; + const int nDivisor = 1; + const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval }; + Npp32s* d_pKernel; + const Npp32s nAnchor = 2; + cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize); + if (err != cudaSuccess) { + fprintf(stderr, "Cuda error %d\n", __LINE__); + return 1; + } + err = cudaMalloc((void**)&d_pDst, dimgsize); + if (err != cudaSuccess) { + fprintf(stderr, "Cuda error %d\n", __LINE__); + return 1; + } + err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0])); + if (err != cudaSuccess) { + fprintf(stderr, "Cuda error %d\n", __LINE__); + return 1; + } + // set image to pixval initially + err = cudaMemset(d_pSrc, pixval, simgsize); + if (err != cudaSuccess) { + fprintf(stderr, "Cuda error %d\n", __LINE__); + return 1; + } + err = cudaMemset(d_pDst, 0, dimgsize); + if (err != cudaSuccess) { + fprintf(stderr, "Cuda error %d\n", __LINE__); + return 1; + } + err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]), + cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + fprintf(stderr, "Cuda error %d\n", __LINE__); + return 1; + } + // copy src to dst + NppStatus ret = + nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel, + nMaskSize, nAnchor, nDivisor); + assert(ret == NPP_NO_ERROR); + Npp8u* h_imgres = new Npp8u[dimgpix]; + err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + fprintf(stderr, "Cuda error %d\n", __LINE__); + return 1; + } + // test for filtering + for (int i = 0; i < dimgpix; i++) { + if (h_imgres[i] != (pixval * pixval * nMaskSize)) { + fprintf(stderr, "h_imgres at index %d failed to match\n", i); + return 1; + } + } + + return 0; +} diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/shared.cu b/Tests/CudaOnly/StaticRuntimePlusToolkit/shared.cu new file mode 100644 index 0000000..9967b66 --- /dev/null +++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/shared.cu @@ -0,0 +1,8 @@ + +int curand_main(); +int nppif_main(); + +int shared_version() +{ + return curand_main() == 0 && nppif_main() == 0; +} diff --git a/Tests/CudaOnly/StaticRuntimePlusToolkit/static.cu b/Tests/CudaOnly/StaticRuntimePlusToolkit/static.cu new file mode 100644 index 0000000..ca7eb4c --- /dev/null +++ b/Tests/CudaOnly/StaticRuntimePlusToolkit/static.cu @@ -0,0 +1,8 @@ + +int curand_main(); +int nppif_main(); + +int static_version() +{ + return curand_main() == 0 && nppif_main() == 0; +} |