summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Help/release/dev/cuda-ptx-separable-compilation.rst5
-rw-r--r--Modules/CMakeCUDAInformation.cmake19
-rw-r--r--Modules/Compiler/Clang-CUDA.cmake3
-rw-r--r--Modules/Compiler/NVIDIA-CUDA.cmake3
-rw-r--r--Modules/Platform/Windows-NVIDIA-CUDA.cmake8
-rw-r--r--Source/cmMakefileTargetGenerator.cxx33
-rw-r--r--Source/cmNinjaTargetGenerator.cxx34
-rw-r--r--Source/cmRulePlaceholderExpander.cxx5
-rw-r--r--Source/cmRulePlaceholderExpander.h1
-rw-r--r--Source/cmVisualStudio10TargetGenerator.cxx7
-rw-r--r--Tests/CudaOnly/CMakeLists.txt1
-rw-r--r--Tests/CudaOnly/ExportPTX/CMakeLists.txt2
-rw-r--r--Tests/CudaOnly/SeparateCompilationPTX/CMakeLists.txt51
-rw-r--r--Tests/CudaOnly/SeparateCompilationPTX/kernels.cu14
-rw-r--r--Tests/CudaOnly/SeparateCompilationPTX/main.cu30
-rw-r--r--Tests/CudaOnly/utils/bin2c_wrapper.cmake (renamed from Tests/CudaOnly/ExportPTX/bin2c_wrapper.cmake)0
16 files changed, 158 insertions, 58 deletions
diff --git a/Help/release/dev/cuda-ptx-separable-compilation.rst b/Help/release/dev/cuda-ptx-separable-compilation.rst
new file mode 100644
index 0000000..49c66c0
--- /dev/null
+++ b/Help/release/dev/cuda-ptx-separable-compilation.rst
@@ -0,0 +1,5 @@
+cuda-ptx-separable-compilation
+------------------------------
+
+* ``CUDA`` targets can now enable both :prop_tgt:`CUDA_SEPARABLE_COMPILATION` and
+ :prop_tgt:`CUDA_PTX_COMPILATION`.
diff --git a/Modules/CMakeCUDAInformation.cmake b/Modules/CMakeCUDAInformation.cmake
index e9cfed6..94b67dc 100644
--- a/Modules/CMakeCUDAInformation.cmake
+++ b/Modules/CMakeCUDAInformation.cmake
@@ -160,22 +160,9 @@ if(NOT DEFINED CMAKE_CUDA_ARCHIVE_FINISH)
set(CMAKE_CUDA_ARCHIVE_FINISH "<CMAKE_RANLIB> <TARGET>")
endif()
-#Specify how to compile when ptx has been requested
-if(NOT CMAKE_CUDA_COMPILE_PTX_COMPILATION)
- set(CMAKE_CUDA_COMPILE_PTX_COMPILATION
- "<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <DEFINES> <INCLUDES> <FLAGS> ${_CMAKE_COMPILE_AS_CUDA_FLAG} ${_CMAKE_CUDA_PTX_FLAG} <SOURCE> -o <OBJECT>")
-endif()
-
-#Specify how to compile when separable compilation has been requested
-if(NOT CMAKE_CUDA_COMPILE_SEPARABLE_COMPILATION)
- set(CMAKE_CUDA_COMPILE_SEPARABLE_COMPILATION
- "<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <DEFINES> <INCLUDES> <FLAGS> ${_CMAKE_COMPILE_AS_CUDA_FLAG} ${_CMAKE_CUDA_DEVICE_CODE} <SOURCE> -o <OBJECT>")
-endif()
-
-#Specify how to compile when whole compilation has been requested
-if(NOT CMAKE_CUDA_COMPILE_WHOLE_COMPILATION)
- set(CMAKE_CUDA_COMPILE_WHOLE_COMPILATION
- "<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <DEFINES> <INCLUDES> <FLAGS> ${_CMAKE_COMPILE_AS_CUDA_FLAG} -c <SOURCE> -o <OBJECT>")
+if(NOT CMAKE_CUDA_COMPILE_OBJECT)
+ set(CMAKE_CUDA_COMPILE_OBJECT
+ "<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <DEFINES> <INCLUDES> <FLAGS> ${_CMAKE_COMPILE_AS_CUDA_FLAG} <CUDA_COMPILE_MODE> <SOURCE> -o <OBJECT>")
endif()
# compile a cu file into an executable
diff --git a/Modules/Compiler/Clang-CUDA.cmake b/Modules/Compiler/Clang-CUDA.cmake
index 0223081..b105518 100644
--- a/Modules/Compiler/Clang-CUDA.cmake
+++ b/Modules/Compiler/Clang-CUDA.cmake
@@ -18,8 +18,9 @@ __compiler_clang_cxx_standards(CUDA)
set(CMAKE_CUDA_COMPILER_HAS_DEVICE_LINK_PHASE TRUE)
set(_CMAKE_COMPILE_AS_CUDA_FLAG "-x cuda")
+set(_CMAKE_CUDA_WHOLE_FLAG "-c")
+set(_CMAKE_CUDA_RDC_FLAG "-fgpu-rdc")
set(_CMAKE_CUDA_PTX_FLAG "--cuda-device-only -S")
-set(_CMAKE_CUDA_DEVICE_CODE "-fgpu-rdc -c")
# RulePlaceholderExpander expands crosscompile variables like sysroot and target only for CMAKE_<LANG>_COMPILER. Override the default.
set(CMAKE_CUDA_LINK_EXECUTABLE "<CMAKE_CUDA_COMPILER> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>${__IMPLICIT_LINKS}")
diff --git a/Modules/Compiler/NVIDIA-CUDA.cmake b/Modules/Compiler/NVIDIA-CUDA.cmake
index c2fe42d..2f12b43 100644
--- a/Modules/Compiler/NVIDIA-CUDA.cmake
+++ b/Modules/Compiler/NVIDIA-CUDA.cmake
@@ -5,8 +5,9 @@ set(CMAKE_CUDA_VERBOSE_FLAG "-v")
set(CMAKE_CUDA_VERBOSE_COMPILE_FLAG "-Xcompiler=-v")
set(_CMAKE_COMPILE_AS_CUDA_FLAG "-x cu")
+set(_CMAKE_CUDA_WHOLE_FLAG "-c")
+set(_CMAKE_CUDA_RDC_FLAG "-rdc=true")
set(_CMAKE_CUDA_PTX_FLAG "-ptx")
-set(_CMAKE_CUDA_DEVICE_CODE "-dc")
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 10.2.89)
# The -forward-unknown-to-host-compiler flag was only
diff --git a/Modules/Platform/Windows-NVIDIA-CUDA.cmake b/Modules/Platform/Windows-NVIDIA-CUDA.cmake
index b83932e..6c1699b 100644
--- a/Modules/Platform/Windows-NVIDIA-CUDA.cmake
+++ b/Modules/Platform/Windows-NVIDIA-CUDA.cmake
@@ -1,11 +1,7 @@
include(Platform/Windows-MSVC)
-set(CMAKE_CUDA_COMPILE_PTX_COMPILATION
- "<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <DEFINES> <INCLUDES> <FLAGS> ${_CMAKE_COMPILE_AS_CUDA_FLAG} -ptx <SOURCE> -o <OBJECT> -Xcompiler=-Fd<TARGET_COMPILE_PDB>,-FS")
-set(CMAKE_CUDA_COMPILE_SEPARABLE_COMPILATION
- "<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <DEFINES> <INCLUDES> <FLAGS> ${_CMAKE_COMPILE_AS_CUDA_FLAG} -dc <SOURCE> -o <OBJECT> -Xcompiler=-Fd<TARGET_COMPILE_PDB>,-FS")
-set(CMAKE_CUDA_COMPILE_WHOLE_COMPILATION
- "<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <DEFINES> <INCLUDES> <FLAGS> ${_CMAKE_COMPILE_AS_CUDA_FLAG} -c <SOURCE> -o <OBJECT> -Xcompiler=-Fd<TARGET_COMPILE_PDB>,-FS")
+set(CMAKE_CUDA_COMPILE_OBJECT
+ "<CMAKE_CUDA_COMPILER> ${_CMAKE_CUDA_EXTRA_FLAGS} <DEFINES> <INCLUDES> <FLAGS> ${_CMAKE_COMPILE_AS_CUDA_FLAG} <CUDA_COMPILE_MODE> <SOURCE> -o <OBJECT> -Xcompiler=-Fd<TARGET_COMPILE_PDB>,-FS")
set(__IMPLICIT_LINKS)
foreach(dir ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES})
diff --git a/Source/cmMakefileTargetGenerator.cxx b/Source/cmMakefileTargetGenerator.cxx
index 8edadd3..9f2ae19 100644
--- a/Source/cmMakefileTargetGenerator.cxx
+++ b/Source/cmMakefileTargetGenerator.cxx
@@ -897,28 +897,31 @@ void cmMakefileTargetGenerator::WriteObjectRuleFiles(
// Construct the compile rules.
{
- std::vector<std::string> compileCommands;
+ std::string cudaCompileMode;
if (lang == "CUDA") {
- std::string cmdVar;
if (this->GeneratorTarget->GetPropertyAsBool(
"CUDA_SEPARABLE_COMPILATION")) {
- cmdVar = "CMAKE_CUDA_COMPILE_SEPARABLE_COMPILATION";
- } else if (this->GeneratorTarget->GetPropertyAsBool(
- "CUDA_PTX_COMPILATION")) {
- cmdVar = "CMAKE_CUDA_COMPILE_PTX_COMPILATION";
+ const std::string& rdcFlag =
+ this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_RDC_FLAG");
+ cudaCompileMode = cmStrCat(cudaCompileMode, rdcFlag, " ");
+ }
+ if (this->GeneratorTarget->GetPropertyAsBool("CUDA_PTX_COMPILATION")) {
+ const std::string& ptxFlag =
+ this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_PTX_FLAG");
+ cudaCompileMode = cmStrCat(cudaCompileMode, ptxFlag);
} else {
- cmdVar = "CMAKE_CUDA_COMPILE_WHOLE_COMPILATION";
+ const std::string& wholeFlag =
+ this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_WHOLE_FLAG");
+ cudaCompileMode = cmStrCat(cudaCompileMode, wholeFlag);
}
- const std::string& compileRule =
- this->Makefile->GetRequiredDefinition(cmdVar);
- cmExpandList(compileRule, compileCommands);
- } else {
- const std::string cmdVar = "CMAKE_" + lang + "_COMPILE_OBJECT";
- const std::string& compileRule =
- this->Makefile->GetRequiredDefinition(cmdVar);
- cmExpandList(compileRule, compileCommands);
+ vars.CudaCompileMode = cudaCompileMode.c_str();
}
+ std::vector<std::string> compileCommands;
+ const std::string& compileRule = this->Makefile->GetRequiredDefinition(
+ "CMAKE_" + lang + "_COMPILE_OBJECT");
+ cmExpandList(compileRule, compileCommands);
+
if (this->GeneratorTarget->GetPropertyAsBool("EXPORT_COMPILE_COMMANDS") &&
lang_can_export_cmds && compileCommands.size() == 1) {
std::string compileCommand = compileCommands[0];
diff --git a/Source/cmNinjaTargetGenerator.cxx b/Source/cmNinjaTargetGenerator.cxx
index 57fc020..2c45e1b 100644
--- a/Source/cmNinjaTargetGenerator.cxx
+++ b/Source/cmNinjaTargetGenerator.cxx
@@ -605,6 +605,7 @@ void cmNinjaTargetGenerator::WriteCompileRule(const std::string& lang,
vars.TargetCompilePDB = "$TARGET_COMPILE_PDB";
vars.ObjectDir = "$OBJECT_DIR";
vars.ObjectFileDir = "$OBJECT_FILE_DIR";
+ vars.CudaCompileMode = "$CUDA_COMPILE_MODE";
vars.ISPCHeader = "$ISPC_HEADER_FILE";
cmMakefile* mf = this->GetMakefile();
@@ -815,27 +816,32 @@ void cmNinjaTargetGenerator::WriteCompileRule(const std::string& lang,
vars.Flags = flags.c_str();
vars.DependencyFile = rule.DepFile.c_str();
- // Rule for compiling object file.
- std::vector<std::string> compileCmds;
+ std::string cudaCompileMode;
if (lang == "CUDA") {
- std::string cmdVar;
if (this->GeneratorTarget->GetPropertyAsBool(
"CUDA_SEPARABLE_COMPILATION")) {
- cmdVar = "CMAKE_CUDA_COMPILE_SEPARABLE_COMPILATION";
- } else if (this->GeneratorTarget->GetPropertyAsBool(
- "CUDA_PTX_COMPILATION")) {
- cmdVar = "CMAKE_CUDA_COMPILE_PTX_COMPILATION";
+ const std::string& rdcFlag =
+ this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_RDC_FLAG");
+ cudaCompileMode = cmStrCat(cudaCompileMode, rdcFlag, " ");
+ }
+ if (this->GeneratorTarget->GetPropertyAsBool("CUDA_PTX_COMPILATION")) {
+ const std::string& ptxFlag =
+ this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_PTX_FLAG");
+ cudaCompileMode = cmStrCat(cudaCompileMode, ptxFlag);
} else {
- cmdVar = "CMAKE_CUDA_COMPILE_WHOLE_COMPILATION";
+ const std::string& wholeFlag =
+ this->Makefile->GetRequiredDefinition("_CMAKE_CUDA_WHOLE_FLAG");
+ cudaCompileMode = cmStrCat(cudaCompileMode, wholeFlag);
}
- const std::string& compileCmd = mf->GetRequiredDefinition(cmdVar);
- cmExpandList(compileCmd, compileCmds);
- } else {
- const std::string cmdVar = cmStrCat("CMAKE_", lang, "_COMPILE_OBJECT");
- const std::string& compileCmd = mf->GetRequiredDefinition(cmdVar);
- cmExpandList(compileCmd, compileCmds);
+ vars.CudaCompileMode = cudaCompileMode.c_str();
}
+ // Rule for compiling object file.
+ std::vector<std::string> compileCmds;
+ const std::string cmdVar = cmStrCat("CMAKE_", lang, "_COMPILE_OBJECT");
+ const std::string& compileCmd = mf->GetRequiredDefinition(cmdVar);
+ cmExpandList(compileCmd, compileCmds);
+
// See if we need to use a compiler launcher like ccache or distcc
std::string compilerLauncher;
if (!compileCmds.empty() &&
diff --git a/Source/cmRulePlaceholderExpander.cxx b/Source/cmRulePlaceholderExpander.cxx
index 7480aeb..4cee09d 100644
--- a/Source/cmRulePlaceholderExpander.cxx
+++ b/Source/cmRulePlaceholderExpander.cxx
@@ -85,6 +85,11 @@ std::string cmRulePlaceholderExpander::ExpandRuleVariable(
return replaceValues.ObjectsQuoted;
}
}
+ if (replaceValues.CudaCompileMode) {
+ if (variable == "CUDA_COMPILE_MODE") {
+ return replaceValues.CudaCompileMode;
+ }
+ }
if (replaceValues.AIXExports) {
if (variable == "AIX_EXPORTS") {
return replaceValues.AIXExports;
diff --git a/Source/cmRulePlaceholderExpander.h b/Source/cmRulePlaceholderExpander.h
index c22e0fa..852954f 100644
--- a/Source/cmRulePlaceholderExpander.h
+++ b/Source/cmRulePlaceholderExpander.h
@@ -65,6 +65,7 @@ public:
const char* SwiftOutputFileMap = nullptr;
const char* SwiftSources = nullptr;
const char* ISPCHeader = nullptr;
+ const char* CudaCompileMode = nullptr;
const char* Fatbinary = nullptr;
const char* RegisterFile = nullptr;
const char* Launcher = nullptr;
diff --git a/Source/cmVisualStudio10TargetGenerator.cxx b/Source/cmVisualStudio10TargetGenerator.cxx
index b792c03..4e93a4a 100644
--- a/Source/cmVisualStudio10TargetGenerator.cxx
+++ b/Source/cmVisualStudio10TargetGenerator.cxx
@@ -3211,18 +3211,17 @@ bool cmVisualStudio10TargetGenerator::ComputeCudaOptions(
// the default to not have any extension
cudaOptions.AddFlag("CompileOut", "$(IntDir)%(Filename).obj");
- bool notPtx = true;
if (this->GeneratorTarget->GetPropertyAsBool("CUDA_SEPARABLE_COMPILATION")) {
cudaOptions.AddFlag("GenerateRelocatableDeviceCode", "true");
- } else if (this->GeneratorTarget->GetPropertyAsBool(
- "CUDA_PTX_COMPILATION")) {
+ }
+ bool notPtx = true;
+ if (this->GeneratorTarget->GetPropertyAsBool("CUDA_PTX_COMPILATION")) {
cudaOptions.AddFlag("NvccCompilation", "ptx");
// We drop the %(Extension) component as CMake expects all PTX files
// to not have the source file extension at all
cudaOptions.AddFlag("CompileOut", "$(IntDir)%(Filename).ptx");
notPtx = false;
}
-
if (notPtx &&
cmSystemTools::VersionCompareGreaterEq(
"8.0", this->GlobalGenerator->GetPlatformToolsetCudaString())) {
diff --git a/Tests/CudaOnly/CMakeLists.txt b/Tests/CudaOnly/CMakeLists.txt
index a3fb409..65dfebb 100644
--- a/Tests/CudaOnly/CMakeLists.txt
+++ b/Tests/CudaOnly/CMakeLists.txt
@@ -16,6 +16,7 @@ add_cuda_test_macro(CudaOnly.WithDefs CudaOnlyWithDefs)
add_cuda_test_macro(CudaOnly.CircularLinkLine CudaOnlyCircularLinkLine)
add_cuda_test_macro(CudaOnly.ResolveDeviceSymbols CudaOnlyResolveDeviceSymbols)
add_cuda_test_macro(CudaOnly.SeparateCompilation main/CudaOnlySeparateCompilation)
+add_cuda_test_macro(CudaOnly.SeparateCompilationPTX CudaOnlySeparateCompilationPTX)
if(CMake_TEST_CUDA AND NOT CMake_TEST_CUDA STREQUAL "Clang")
# Clang doesn't have flags for selecting the runtime.
diff --git a/Tests/CudaOnly/ExportPTX/CMakeLists.txt b/Tests/CudaOnly/ExportPTX/CMakeLists.txt
index e7e7bc4..f1667af 100644
--- a/Tests/CudaOnly/ExportPTX/CMakeLists.txt
+++ b/Tests/CudaOnly/ExportPTX/CMakeLists.txt
@@ -56,7 +56,7 @@ add_custom_command(
"-DBIN_TO_C_COMMAND=${bin_to_c}"
"-DOBJECTS=$<TARGET_OBJECTS:CudaPTX>"
"-DOUTPUT=${output_file}"
- -P ${CMAKE_CURRENT_SOURCE_DIR}/bin2c_wrapper.cmake
+ -P ${CMAKE_CURRENT_SOURCE_DIR}/../utils/bin2c_wrapper.cmake
VERBATIM
DEPENDS $<TARGET_OBJECTS:CudaPTX>
COMMENT "Converting Object files to a C header"
diff --git a/Tests/CudaOnly/SeparateCompilationPTX/CMakeLists.txt b/Tests/CudaOnly/SeparateCompilationPTX/CMakeLists.txt
new file mode 100644
index 0000000..273f955
--- /dev/null
+++ b/Tests/CudaOnly/SeparateCompilationPTX/CMakeLists.txt
@@ -0,0 +1,51 @@
+cmake_minimum_required(VERSION 3.19)
+project (SeparateCompPTX CUDA)
+
+#Goal for this example:
+# How to generate PTX files with RDC enabled
+
+# PTX can be compiled only for a single virtual architecture at a time
+list(POP_FRONT CMAKE_CUDA_ARCHITECTURES temp)
+set(CMAKE_CUDA_ARCHITECTURES ${temp})
+string(APPEND CMAKE_CUDA_ARCHITECTURES "-virtual")
+
+add_library(CudaPTX OBJECT kernels.cu)
+set_property(TARGET CudaPTX PROPERTY CUDA_PTX_COMPILATION ON)
+set_property(TARGET CudaPTX PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+
+
+set(output_file ${CMAKE_CURRENT_BINARY_DIR}/embedded_objs.h)
+
+find_package(CUDAToolkit REQUIRED)
+find_program(bin_to_c
+ NAMES bin2c
+ PATHS ${CUDAToolkit_BIN_DIR}
+ )
+if(NOT bin_to_c)
+ message(FATAL_ERROR
+ "bin2c not found:\n"
+ " CUDAToolkit_BIN_DIR='${CUDAToolkit_BIN_DIR}'\n"
+ )
+endif()
+
+add_custom_command(
+ OUTPUT "${output_file}"
+ COMMAND ${CMAKE_COMMAND}
+ "-DBIN_TO_C_COMMAND=${bin_to_c}"
+ "-DOBJECTS=$<TARGET_OBJECTS:CudaPTX>"
+ "-DOUTPUT=${output_file}"
+ -P ${CMAKE_CURRENT_SOURCE_DIR}/../utils/bin2c_wrapper.cmake
+ VERBATIM
+ DEPENDS $<TARGET_OBJECTS:CudaPTX>
+ COMMENT "Converting Object files to a C header"
+ )
+
+add_executable(CudaOnlySeparateCompilationPTX main.cu ${output_file})
+target_compile_features(CudaOnlySeparateCompilationPTX PRIVATE cuda_std_11)
+target_include_directories(CudaOnlySeparateCompilationPTX PRIVATE
+ ${CMAKE_CURRENT_BINARY_DIR} )
+target_link_libraries(CudaOnlySeparateCompilationPTX PRIVATE CUDA::cuda_driver)
+if(APPLE)
+ # Help the static cuda runtime find the driver (libcuda.dyllib) at runtime.
+ set_property(TARGET CudaOnlySeparateCompilationPTX PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+endif()
diff --git a/Tests/CudaOnly/SeparateCompilationPTX/kernels.cu b/Tests/CudaOnly/SeparateCompilationPTX/kernels.cu
new file mode 100644
index 0000000..f4a52d4
--- /dev/null
+++ b/Tests/CudaOnly/SeparateCompilationPTX/kernels.cu
@@ -0,0 +1,14 @@
+
+__global__ void kernelA(float* r, float* x, float* y, float* z, int size)
+{
+ for (int i = threadIdx.x; i < size; i += blockDim.x) {
+ r[i] = x[i] * y[i] + z[i];
+ }
+}
+
+__global__ void kernelB(float* r, float* x, float* y, float* z, int size)
+{
+ for (int i = threadIdx.x; i < size; i += blockDim.x) {
+ r[i] = x[i] * y[i] + z[i];
+ }
+}
diff --git a/Tests/CudaOnly/SeparateCompilationPTX/main.cu b/Tests/CudaOnly/SeparateCompilationPTX/main.cu
new file mode 100644
index 0000000..164cde5
--- /dev/null
+++ b/Tests/CudaOnly/SeparateCompilationPTX/main.cu
@@ -0,0 +1,30 @@
+#include <iostream>
+
+#include <cuda.h>
+
+#include "embedded_objs.h"
+
+int main()
+{
+ cuInit(0);
+ int count = 0;
+ cuDeviceGetCount(&count);
+ if (count == 0) {
+ std::cerr << "No CUDA devices found\n";
+ return 1;
+ }
+
+ CUdevice device;
+ cuDeviceGet(&device, 0);
+
+ CUcontext context;
+ cuCtxCreate(&context, 0, device);
+
+ CUmodule module;
+ cuModuleLoadData(&module, kernels);
+ if (module == nullptr) {
+ std::cerr << "Failed to load the embedded ptx" << std::endl;
+ return 1;
+ }
+ std::cout << module << std::endl;
+}
diff --git a/Tests/CudaOnly/ExportPTX/bin2c_wrapper.cmake b/Tests/CudaOnly/utils/bin2c_wrapper.cmake
index 0baf934..0baf934 100644
--- a/Tests/CudaOnly/ExportPTX/bin2c_wrapper.cmake
+++ b/Tests/CudaOnly/utils/bin2c_wrapper.cmake